Don't prefetch from NULL in the SSE2 fast paths.
pixman/pixman-sse2.c
1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <mmintrin.h>
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41  * the pixman-x64-mmx-emulation.h file contains
42  * implementations of those MMX intrinsics that
43  * are used in the SSE2 implementation.
44  */
45 #   include "pixman-x64-mmx-emulation.h"
46 #endif
47
48 #ifdef USE_SSE2
49
50 /* --------------------------------------------------------------------
51  * Locals
52  */
53
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
58
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
61
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
68
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
75
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
78
79 /* ----------------------------------------------------------------------
80  * SSE2 Inlines
81  */
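/* Note on the conventions used below: a "packed" __m128i holds four 32-bit
 * a8r8g8b8 pixels (one byte per channel); "unpacking" zero-extends each byte
 * into a 16-bit lane so per-channel products fit without overflow, with the
 * _lo/_hi halves holding the lower/upper two pixels of the group of four.
 */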
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
84 {
85     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
86 }
87
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
90 {
91     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
93 }
94
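/* The next helper expands r5g6b5 to x8r8g8b8: each field is shifted into its
 * 8888 byte position and, via the mask_565_fix_* masks and the >> 5 / >> 6
 * steps, the field's top bits are OR'ed back into the newly created low bits,
 * so that 0x1f maps to 0xff and 0x3f maps to 0xff.
 */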
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
97 {
98     __m128i r, g, b, rb, t;
99
100     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
103
104     rb = _mm_or_si128 (r, b);
105     t  = _mm_and_si128 (rb, mask_565_fix_rb);
106     t  = _mm_srli_epi32 (t, 5);
107     rb = _mm_or_si128 (rb, t);
108
109     t  = _mm_and_si128 (g, mask_565_fix_g);
110     t  = _mm_srli_epi32 (t, 6);
111     g  = _mm_or_si128 (g, t);
112
113     return _mm_or_si128 (rb, g);
114 }
115
116 static force_inline void
117 unpack_565_128_4x128 (__m128i  data,
118                       __m128i* data0,
119                       __m128i* data1,
120                       __m128i* data2,
121                       __m128i* data3)
122 {
123     __m128i lo, hi;
124
125     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
127
128     lo = unpack_565_to_8888 (lo);
129     hi = unpack_565_to_8888 (hi);
130
131     unpack_128_2x128 (lo, data0, data1);
132     unpack_128_2x128 (hi, data2, data3);
133 }
134
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
137 {
138     return (uint16_t) (((pixel >> 8) & 0xf800) |
139                        ((pixel >> 5) & 0x07e0) |
140                        ((pixel >> 3) & 0x001f));
141 }
142
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
145 {
146     return _mm_packus_epi16 (lo, hi);
147 }
148
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
151 {
152     __m128i data;
153     __m128i r, g1, g2, b;
154
155     data = pack_2x128_128 (lo, hi);
156
157     r  = _mm_and_si128 (data, mask_565_r);
158     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
162     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163 }
164
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167 {
168     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169                              pack_565_2x128_128 (*xmm2, *xmm3));
170 }
171
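/* _mm_movemask_epi8 collects the top bit of each of the 16 bytes, so the
 * 0x8888 masks below look only at the alpha byte of each of the four
 * a8r8g8b8 pixels: is_opaque checks that every alpha is 0xff, is_zero that
 * the whole vector is zero, and is_transparent that every alpha is zero.
 */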
172 static force_inline int
173 is_opaque (__m128i x)
174 {
175     __m128i ffs = _mm_cmpeq_epi8 (x, x);
176
177     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178 }
179
180 static force_inline int
181 is_zero (__m128i x)
182 {
183     return _mm_movemask_epi8 (
184         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185 }
186
187 static force_inline int
188 is_transparent (__m128i x)
189 {
190     return (_mm_movemask_epi8 (
191                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192 }
193
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
196 {
197     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198 }
199
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
202 {
203     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204                                                      _MM_SHUFFLE (3, 3, 3, 3)),
205                                 _MM_SHUFFLE (3, 3, 3, 3));
206 }
207
208 static force_inline void
209 expand_alpha_2x128 (__m128i  data_lo,
210                     __m128i  data_hi,
211                     __m128i* alpha_lo,
212                     __m128i* alpha_hi)
213 {
214     __m128i lo, hi;
215
216     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
219     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221 }
222
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i  data_lo,
225                         __m128i  data_hi,
226                         __m128i* alpha_lo,
227                         __m128i* alpha_hi)
228 {
229     __m128i lo, hi;
230
231     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235 }
236
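/* Per-channel multiply of two unpacked (16-bit-per-channel) pixel pairs with
 * the usual pixman rounding: t = a*b + 0x80, then (t + (t >> 8)) >> 8, which
 * is computed here as _mm_mulhi_epu16 (t, 0x0101) and yields the correctly
 * rounded a*b/255 for 8-bit inputs.
 */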
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
239                     __m128i* data_hi,
240                     __m128i* alpha_lo,
241                     __m128i* alpha_hi,
242                     __m128i* ret_lo,
243                     __m128i* ret_hi)
244 {
245     __m128i lo, hi;
246
247     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249     lo = _mm_adds_epu16 (lo, mask_0080);
250     hi = _mm_adds_epu16 (hi, mask_0080);
251     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253 }
254
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
257                         __m128i* src_hi,
258                         __m128i* alpha_dst_lo,
259                         __m128i* alpha_dst_hi,
260                         __m128i* dst_lo,
261                         __m128i* dst_hi,
262                         __m128i* alpha_src_lo,
263                         __m128i* alpha_src_hi,
264                         __m128i* ret_lo,
265                         __m128i* ret_hi)
266 {
267     __m128i t1_lo, t1_hi;
268     __m128i t2_lo, t2_hi;
269
270     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272
273     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275 }
276
277 static force_inline void
278 negate_2x128 (__m128i  data_lo,
279               __m128i  data_hi,
280               __m128i* neg_lo,
281               __m128i* neg_hi)
282 {
283     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285 }
286
287 static force_inline void
288 invert_colors_2x128 (__m128i  data_lo,
289                      __m128i  data_hi,
290                      __m128i* inv_lo,
291                      __m128i* inv_hi)
292 {
293     __m128i lo, hi;
294
295     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299 }
300
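/* Premultiplied OVER on two unpacked pixel pairs, computed in place in
 * dst_lo/dst_hi:  dst = src + dst * (255 - alpha) / 255.
 */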
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
303             __m128i* src_hi,
304             __m128i* alpha_lo,
305             __m128i* alpha_hi,
306             __m128i* dst_lo,
307             __m128i* dst_hi)
308 {
309     __m128i t1, t2;
310
311     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
312
313     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
314
315     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
317 }
318
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i  src_lo,
321                         __m128i  src_hi,
322                         __m128i* dst_lo,
323                         __m128i* dst_hi)
324 {
325     __m128i lo, hi;
326     __m128i alpha_lo, alpha_hi;
327
328     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
329
330     lo = _mm_or_si128 (alpha_lo, mask_alpha);
331     hi = _mm_or_si128 (alpha_hi, mask_alpha);
332
333     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
334
335     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
336
337     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
338 }
339
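/* in_over: the source and its alpha are first multiplied by the unpacked
 * mask, and the result is then composited OVER the destination, i.e.
 * OVER ((src IN mask), (alpha IN mask), dst).
 */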
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
342                __m128i* src_hi,
343                __m128i* alpha_lo,
344                __m128i* alpha_hi,
345                __m128i* mask_lo,
346                __m128i* mask_hi,
347                __m128i* dst_lo,
348                __m128i* dst_hi)
349 {
350     __m128i s_lo, s_hi;
351     __m128i a_lo, a_hi;
352
353     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355
356     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357 }
358
359 static force_inline void
360 cache_prefetch (__m128i* addr)
361 {
362     _mm_prefetch ((void const*)addr, _MM_HINT_T0);
363 }
364
365 static force_inline void
366 cache_prefetch_next (__m128i* addr)
367 {
368     _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
369 }
370
371 /* Prefetching NULL is very slow on some systems, so don't do that. */
372
373 static force_inline void
374 maybe_prefetch (__m128i* addr)
375 {
376     if (addr)
377         cache_prefetch (addr);
378 }
379
380 static force_inline void
381 maybe_prefetch_next (__m128i* addr)
382 {
383     if (addr)
384         cache_prefetch_next (addr);
385 }
386
387 /* load 4 pixels from a 16-byte-aligned address */
388 static force_inline __m128i
389 load_128_aligned (__m128i* src)
390 {
391     return _mm_load_si128 (src);
392 }
393
394 /* load 4 pixels from an unaligned address */
395 static force_inline __m128i
396 load_128_unaligned (const __m128i* src)
397 {
398     return _mm_loadu_si128 (src);
399 }
400
401 /* save 4 pixels to a 16-byte-aligned address using a non-temporal
402  * (write-combining) store
403  */
404 static force_inline void
405 save_128_write_combining (__m128i* dst,
406                           __m128i  data)
407 {
408     _mm_stream_si128 (dst, data);
409 }
410
411 /* save 4 pixels to a 16-byte-aligned address */
412 static force_inline void
413 save_128_aligned (__m128i* dst,
414                   __m128i  data)
415 {
416     _mm_store_si128 (dst, data);
417 }
418
419 /* save 4 pixels to an unaligned address */
420 static force_inline void
421 save_128_unaligned (__m128i* dst,
422                     __m128i  data)
423 {
424     _mm_storeu_si128 (dst, data);
425 }
426
427 /* ------------------------------------------------------------------
428  * MMX inlines
429  */
430
431 static force_inline __m64
432 unpack_32_1x64 (uint32_t data)
433 {
434     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
435 }
436
437 static force_inline __m64
438 expand_alpha_1x64 (__m64 data)
439 {
440     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
441 }
442
443 static force_inline __m64
444 expand_alpha_rev_1x64 (__m64 data)
445 {
446     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
447 }
448
449 static force_inline __m64
450 expand_pixel_8_1x64 (uint8_t data)
451 {
452     return _mm_shuffle_pi16 (
453         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
454 }
455
456 static force_inline __m64
457 pix_multiply_1x64 (__m64 data,
458                    __m64 alpha)
459 {
460     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
461                                           mask_x0080),
462                            mask_x0101);
463 }
464
465 static force_inline __m64
466 pix_add_multiply_1x64 (__m64* src,
467                        __m64* alpha_dst,
468                        __m64* dst,
469                        __m64* alpha_src)
470 {
471     __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
472     __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
473
474     return _mm_adds_pu8 (t1, t2);
475 }
476
477 static force_inline __m64
478 negate_1x64 (__m64 data)
479 {
480     return _mm_xor_si64 (data, mask_x00ff);
481 }
482
483 static force_inline __m64
484 invert_colors_1x64 (__m64 data)
485 {
486     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
487 }
488
489 static force_inline __m64
490 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
491 {
492     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
493 }
494
495 static force_inline __m64
496 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
497 {
498     return over_1x64 (pix_multiply_1x64 (*src, *mask),
499                       pix_multiply_1x64 (*alpha, *mask),
500                       *dst);
501 }
502
503 static force_inline __m64
504 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
505 {
506     __m64 alpha = expand_alpha_1x64 (src);
507
508     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
509                                          _mm_or_si64 (alpha, mask_x_alpha)),
510                       alpha,
511                       dst);
512 }
513
514 static force_inline uint32_t
515 pack_1x64_32 (__m64 data)
516 {
517     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
518 }
519
520 /* Expand the 16-bit 565 pixel held in the low word of an MMX register into
521  *
522  *    00RR00GG00BB
523  *
524  * --- Expanding 565 in the low word ---
525  *
526  * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
527  * m = m & (01f0003f001f);
528  * m = m * (008404100840);
529  * m = m >> 8;
530  *
531  * Note the trick here - the top (red) word is shifted by another nibble to
532  * avoid it bumping into the middle (green) word
533  */
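/* A worked example, using the constants quoted in the comment above: the
 * pure-red pixel 0xf800 becomes 0x01f000000000 after the shifts and the
 * mask, the per-word multiply then yields 0xffc000000000, and the final
 * >> 8 leaves 0x00ff00000000, i.e. R = 0xff, G = B = 0.
 */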
534 static force_inline __m64
535 expand565_16_1x64 (uint16_t pixel)
536 {
537     __m64 p;
538     __m64 t1, t2;
539
540     p = _mm_cvtsi32_si64 ((uint32_t) pixel);
541
542     t1 = _mm_slli_si64 (p, 36 - 11);
543     t2 = _mm_slli_si64 (p, 16 - 5);
544
545     p = _mm_or_si64 (t1, p);
546     p = _mm_or_si64 (t2, p);
547     p = _mm_and_si64 (p, mask_x565_rgb);
548     p = _mm_mullo_pi16 (p, mask_x565_unpack);
549
550     return _mm_srli_pi16 (p, 8);
551 }
552
553 /* ----------------------------------------------------------------------------
554  * Compose Core transformations
555  */
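/* Per-pixel OVER used by the scalar head/tail loops: a fully opaque source
 * is copied through, a fully zero source leaves dst untouched, and anything
 * else takes the general src + (1 - src.alpha) * dst path.
 */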
556 static force_inline uint32_t
557 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
558 {
559     uint8_t a;
560     __m64 ms;
561
562     a = src >> 24;
563
564     if (a == 0xff)
565     {
566         return src;
567     }
568     else if (src)
569     {
570         ms = unpack_32_1x64 (src);
571         return pack_1x64_32 (
572             over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
573     }
574
575     return dst;
576 }
577
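/* combine1/combine4 fetch 1 or 4 source pixels and, when a mask pointer is
 * given, multiply them by the mask's expanded alpha; with pm == NULL the
 * source is returned unchanged.  combine4 also shortcuts to zero when all
 * four mask alphas are zero.
 */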
578 static force_inline uint32_t
579 combine1 (const uint32_t *ps, const uint32_t *pm)
580 {
581     uint32_t s = *ps;
582
583     if (pm)
584     {
585         __m64 ms, mm;
586
587         mm = unpack_32_1x64 (*pm);
588         mm = expand_alpha_1x64 (mm);
589
590         ms = unpack_32_1x64 (s);
591         ms = pix_multiply_1x64 (ms, mm);
592
593         s = pack_1x64_32 (ms);
594     }
595
596     return s;
597 }
598
599 static force_inline __m128i
600 combine4 (const __m128i *ps, const __m128i *pm)
601 {
602     __m128i xmm_src_lo, xmm_src_hi;
603     __m128i xmm_msk_lo, xmm_msk_hi;
604     __m128i s;
605
606     if (pm)
607     {
608         xmm_msk_lo = load_128_unaligned (pm);
609
610         if (is_transparent (xmm_msk_lo))
611             return _mm_setzero_si128 ();
612     }
613
614     s = load_128_unaligned (ps);
615
616     if (pm)
617     {
618         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
619         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
620
621         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
622
623         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
624                             &xmm_msk_lo, &xmm_msk_hi,
625                             &xmm_src_lo, &xmm_src_hi);
626
627         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
628     }
629
630     return s;
631 }
632
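/* The core_combine_*_sse2 loops below all share the same shape: a scalar
 * loop until pd reaches a 16-byte boundary, a 4-pixels-at-a-time SSE2 loop
 * with aligned destination loads/stores, and a scalar tail.  The OVER
 * variant additionally skips groups that are fully opaque or fully zero.
 */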
633 static force_inline void
634 core_combine_over_u_sse2 (uint32_t*       pd,
635                           const uint32_t* ps,
636                           const uint32_t* pm,
637                           int             w)
638 {
639     uint32_t s, d;
640
641     __m128i xmm_dst_lo, xmm_dst_hi;
642     __m128i xmm_src_lo, xmm_src_hi;
643     __m128i xmm_alpha_lo, xmm_alpha_hi;
644
645     /* call prefetch hint to optimize cache load*/
646     cache_prefetch ((__m128i*)ps);
647     cache_prefetch ((__m128i*)pd);
648     maybe_prefetch ((__m128i*)pm);
649
650     /* Align dst on a 16-byte boundary */
651     while (w && ((unsigned long)pd & 15))
652     {
653         d = *pd;
654         s = combine1 (ps, pm);
655
656         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
657         ps++;
658         if (pm)
659             pm++;
660         w--;
661     }
662
663     /* call prefetch hint to optimize cache load*/
664     cache_prefetch ((__m128i*)ps);
665     cache_prefetch ((__m128i*)pd);
666     maybe_prefetch ((__m128i*)pm);
667
668     while (w >= 4)
669     {
670         /* fill cache line with next memory */
671         cache_prefetch_next ((__m128i*)ps);
672         cache_prefetch_next ((__m128i*)pd);
673         maybe_prefetch_next ((__m128i*)pm);
674
675         /* I'm loading unaligned because I'm not sure about
676          * the address alignment.
677          */
678         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
679
680         if (is_opaque (xmm_src_hi))
681         {
682             save_128_aligned ((__m128i*)pd, xmm_src_hi);
683         }
684         else if (!is_zero (xmm_src_hi))
685         {
686             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
687
688             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
689             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
690
691             expand_alpha_2x128 (
692                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
693
694             over_2x128 (&xmm_src_lo, &xmm_src_hi,
695                         &xmm_alpha_lo, &xmm_alpha_hi,
696                         &xmm_dst_lo, &xmm_dst_hi);
697
698             /* rebuild the 4 pixel data and save */
699             save_128_aligned ((__m128i*)pd,
700                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
701         }
702
703         w -= 4;
704         ps += 4;
705         pd += 4;
706         if (pm)
707             pm += 4;
708     }
709
710     while (w)
711     {
712         d = *pd;
713         s = combine1 (ps, pm);
714
715         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
716         ps++;
717         if (pm)
718             pm++;
719
720         w--;
721     }
722 }
723
724 static force_inline void
725 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
726                                   const uint32_t* ps,
727                                   const uint32_t* pm,
728                                   int             w)
729 {
730     uint32_t s, d;
731
732     __m128i xmm_dst_lo, xmm_dst_hi;
733     __m128i xmm_src_lo, xmm_src_hi;
734     __m128i xmm_alpha_lo, xmm_alpha_hi;
735
736     /* call prefetch hint to optimize cache load*/
737     cache_prefetch ((__m128i*)ps);
738     cache_prefetch ((__m128i*)pd);
739     maybe_prefetch ((__m128i*)pm);
740
741     /* Align dst on a 16-byte boundary */
742     while (w &&
743            ((unsigned long)pd & 15))
744     {
745         d = *pd;
746         s = combine1 (ps, pm);
747
748         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
749         w--;
750         ps++;
751         if (pm)
752             pm++;
753     }
754
755     /* call prefetch hint to optimize cache load*/
756     cache_prefetch ((__m128i*)ps);
757     cache_prefetch ((__m128i*)pd);
758     maybe_prefetch ((__m128i*)pm);
759
760     while (w >= 4)
761     {
762         /* fill cache line with next memory */
763         cache_prefetch_next ((__m128i*)ps);
764         cache_prefetch_next ((__m128i*)pd);
765         maybe_prefetch_next ((__m128i*)pm);
766
767         /* I'm loading unaligned because I'm not sure
768          * about the address alignment.
769          */
770         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
771         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
772
773         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
774         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
775
776         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
777                             &xmm_alpha_lo, &xmm_alpha_hi);
778
779         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
780                     &xmm_alpha_lo, &xmm_alpha_hi,
781                     &xmm_src_lo, &xmm_src_hi);
782
783         /* rebuild the 4 pixel data and save */
784         save_128_aligned ((__m128i*)pd,
785                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
786
787         w -= 4;
788         ps += 4;
789         pd += 4;
790
791         if (pm)
792             pm += 4;
793     }
794
795     while (w)
796     {
797         d = *pd;
798         s = combine1 (ps, pm);
799
800         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
801         ps++;
802         w--;
803         if (pm)
804             pm++;
805     }
806 }
807
808 static force_inline uint32_t
809 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
810 {
811     uint32_t maska = src >> 24;
812
813     if (maska == 0)
814     {
815         return 0;
816     }
817     else if (maska != 0xff)
818     {
819         return pack_1x64_32 (
820             pix_multiply_1x64 (unpack_32_1x64 (dst),
821                                expand_alpha_1x64 (unpack_32_1x64 (src))));
822     }
823
824     return dst;
825 }
826
827 static force_inline void
828 core_combine_in_u_sse2 (uint32_t*       pd,
829                         const uint32_t* ps,
830                         const uint32_t* pm,
831                         int             w)
832 {
833     uint32_t s, d;
834
835     __m128i xmm_src_lo, xmm_src_hi;
836     __m128i xmm_dst_lo, xmm_dst_hi;
837
838     /* call prefetch hint to optimize cache load*/
839     cache_prefetch ((__m128i*)ps);
840     cache_prefetch ((__m128i*)pd);
841     maybe_prefetch ((__m128i*)pm);
842
843     while (w && ((unsigned long) pd & 15))
844     {
845         s = combine1 (ps, pm);
846         d = *pd;
847
848         *pd++ = core_combine_in_u_pixelsse2 (d, s);
849         w--;
850         ps++;
851         if (pm)
852             pm++;
853     }
854
855     /* call prefetch hint to optimize cache load*/
856     cache_prefetch ((__m128i*)ps);
857     cache_prefetch ((__m128i*)pd);
858     maybe_prefetch ((__m128i*)pm);
859
860     while (w >= 4)
861     {
862         /* fill cache line with next memory */
863         cache_prefetch_next ((__m128i*)ps);
864         cache_prefetch_next ((__m128i*)pd);
865         maybe_prefetch_next ((__m128i*)pm);
866
867         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
868         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
869
870         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
871         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
872
873         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
874         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
875                             &xmm_dst_lo, &xmm_dst_hi,
876                             &xmm_dst_lo, &xmm_dst_hi);
877
878         save_128_aligned ((__m128i*)pd,
879                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
880
881         ps += 4;
882         pd += 4;
883         w -= 4;
884         if (pm)
885             pm += 4;
886     }
887
888     while (w)
889     {
890         s = combine1 (ps, pm);
891         d = *pd;
892
893         *pd++ = core_combine_in_u_pixelsse2 (d, s);
894         w--;
895         ps++;
896         if (pm)
897             pm++;
898     }
899 }
900
901 static force_inline void
902 core_combine_reverse_in_u_sse2 (uint32_t*       pd,
903                                 const uint32_t* ps,
904                                 const uint32_t *pm,
905                                 int             w)
906 {
907     uint32_t s, d;
908
909     __m128i xmm_src_lo, xmm_src_hi;
910     __m128i xmm_dst_lo, xmm_dst_hi;
911
912     /* call prefetch hint to optimize cache load*/
913     cache_prefetch ((__m128i*)ps);
914     cache_prefetch ((__m128i*)pd);
915     maybe_prefetch ((__m128i*)pm);
916
917     while (w && ((unsigned long) pd & 15))
918     {
919         s = combine1 (ps, pm);
920         d = *pd;
921
922         *pd++ = core_combine_in_u_pixelsse2 (s, d);
923         ps++;
924         w--;
925         if (pm)
926             pm++;
927     }
928
929     /* call prefetch hint to optimize cache load*/
930     cache_prefetch ((__m128i*)ps);
931     cache_prefetch ((__m128i*)pd);
932     maybe_prefetch ((__m128i*)pm);
933
934     while (w >= 4)
935     {
936         /* fill cache line with next memory */
937         cache_prefetch_next ((__m128i*)ps);
938         cache_prefetch_next ((__m128i*)pd);
939         maybe_prefetch_next ((__m128i*)pm);
940
941         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
942         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
943
944         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
945         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
946
947         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
948         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
949                             &xmm_src_lo, &xmm_src_hi,
950                             &xmm_dst_lo, &xmm_dst_hi);
951
952         save_128_aligned (
953             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
954
955         ps += 4;
956         pd += 4;
957         w -= 4;
958         if (pm)
959             pm += 4;
960     }
961
962     while (w)
963     {
964         s = combine1 (ps, pm);
965         d = *pd;
966
967         *pd++ = core_combine_in_u_pixelsse2 (s, d);
968         w--;
969         ps++;
970         if (pm)
971             pm++;
972     }
973 }
974
975 static force_inline void
976 core_combine_reverse_out_u_sse2 (uint32_t*       pd,
977                                  const uint32_t* ps,
978                                  const uint32_t* pm,
979                                  int             w)
980 {
981     /* call prefetch hint to optimize cache load*/
982     cache_prefetch ((__m128i*)ps);
983     cache_prefetch ((__m128i*)pd);
984     maybe_prefetch ((__m128i*)pm);
985
986     while (w && ((unsigned long) pd & 15))
987     {
988         uint32_t s = combine1 (ps, pm);
989         uint32_t d = *pd;
990
991         *pd++ = pack_1x64_32 (
992             pix_multiply_1x64 (
993                 unpack_32_1x64 (d), negate_1x64 (
994                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
995         
996         if (pm)
997             pm++;
998         ps++;
999         w--;
1000     }
1001
1002     /* call prefetch hint to optimize cache load*/
1003     cache_prefetch ((__m128i*)ps);
1004     cache_prefetch ((__m128i*)pd);
1005     maybe_prefetch ((__m128i*)pm);
1006
1007     while (w >= 4)
1008     {
1009         __m128i xmm_src_lo, xmm_src_hi;
1010         __m128i xmm_dst_lo, xmm_dst_hi;
1011
1012         /* fill cache line with next memory */
1013         cache_prefetch_next ((__m128i*)ps);
1014         cache_prefetch_next ((__m128i*)pd);
1015         maybe_prefetch_next ((__m128i*)pm);
1016
1017         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1018         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1019
1020         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1021         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1022
1023         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1024         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1025
1026         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1027                             &xmm_src_lo, &xmm_src_hi,
1028                             &xmm_dst_lo, &xmm_dst_hi);
1029
1030         save_128_aligned (
1031             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1032
1033         ps += 4;
1034         pd += 4;
1035         if (pm)
1036             pm += 4;
1037
1038         w -= 4;
1039     }
1040
1041     while (w)
1042     {
1043         uint32_t s = combine1 (ps, pm);
1044         uint32_t d = *pd;
1045
1046         *pd++ = pack_1x64_32 (
1047             pix_multiply_1x64 (
1048                 unpack_32_1x64 (d), negate_1x64 (
1049                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1050         ps++;
1051         if (pm)
1052             pm++;
1053         w--;
1054     }
1055 }
1056
1057 static force_inline void
1058 core_combine_out_u_sse2 (uint32_t*       pd,
1059                          const uint32_t* ps,
1060                          const uint32_t* pm,
1061                          int             w)
1062 {
1063     /* call prefetch hint to optimize cache load*/
1064     cache_prefetch ((__m128i*)ps);
1065     cache_prefetch ((__m128i*)pd);
1066     maybe_prefetch ((__m128i*)pm);
1067
1068     while (w && ((unsigned long) pd & 15))
1069     {
1070         uint32_t s = combine1 (ps, pm);
1071         uint32_t d = *pd;
1072
1073         *pd++ = pack_1x64_32 (
1074             pix_multiply_1x64 (
1075                 unpack_32_1x64 (s), negate_1x64 (
1076                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1077         w--;
1078         ps++;
1079         if (pm)
1080             pm++;
1081     }
1082
1083     /* call prefetch hint to optimize cache load*/
1084     cache_prefetch ((__m128i*)ps);
1085     cache_prefetch ((__m128i*)pd);
1086     maybe_prefetch ((__m128i*)pm);
1087
1088     while (w >= 4)
1089     {
1090         __m128i xmm_src_lo, xmm_src_hi;
1091         __m128i xmm_dst_lo, xmm_dst_hi;
1092
1093         /* fill cache line with next memory */
1094         cache_prefetch_next ((__m128i*)ps);
1095         cache_prefetch_next ((__m128i*)pd);
1096         maybe_prefetch_next ((__m128i*)pm);
1097
1098         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1099         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1100
1101         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1102         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1103
1104         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1105         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1106
1107         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1108                             &xmm_dst_lo, &xmm_dst_hi,
1109                             &xmm_dst_lo, &xmm_dst_hi);
1110
1111         save_128_aligned (
1112             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1113
1114         ps += 4;
1115         pd += 4;
1116         w -= 4;
1117         if (pm)
1118             pm += 4;
1119     }
1120
1121     while (w)
1122     {
1123         uint32_t s = combine1 (ps, pm);
1124         uint32_t d = *pd;
1125
1126         *pd++ = pack_1x64_32 (
1127             pix_multiply_1x64 (
1128                 unpack_32_1x64 (s), negate_1x64 (
1129                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1130         w--;
1131         ps++;
1132         if (pm)
1133             pm++;
1134     }
1135 }
1136
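/* ATOP: result = src * dst.alpha + dst * (1 - src.alpha), evaluated with a
 * single pix_add_multiply per pixel (or per pixel pair in the SSE2 loop).
 */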
1137 static force_inline uint32_t
1138 core_combine_atop_u_pixel_sse2 (uint32_t src,
1139                                 uint32_t dst)
1140 {
1141     __m64 s = unpack_32_1x64 (src);
1142     __m64 d = unpack_32_1x64 (dst);
1143
1144     __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1145     __m64 da = expand_alpha_1x64 (d);
1146
1147     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1148 }
1149
1150 static force_inline void
1151 core_combine_atop_u_sse2 (uint32_t*       pd,
1152                           const uint32_t* ps,
1153                           const uint32_t* pm,
1154                           int             w)
1155 {
1156     uint32_t s, d;
1157
1158     __m128i xmm_src_lo, xmm_src_hi;
1159     __m128i xmm_dst_lo, xmm_dst_hi;
1160     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1161     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1162
1163     /* call prefetch hint to optimize cache load*/
1164     cache_prefetch ((__m128i*)ps);
1165     cache_prefetch ((__m128i*)pd);
1166     maybe_prefetch ((__m128i*)pm);
1167
1168     while (w && ((unsigned long) pd & 15))
1169     {
1170         s = combine1 (ps, pm);
1171         d = *pd;
1172
1173         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1174         w--;
1175         ps++;
1176         if (pm)
1177             pm++;
1178     }
1179
1180     /* call prefetch hint to optimize cache load*/
1181     cache_prefetch ((__m128i*)ps);
1182     cache_prefetch ((__m128i*)pd);
1183     maybe_prefetch ((__m128i*)pm);
1184
1185     while (w >= 4)
1186     {
1187         /* fill cache line with next memory */
1188         cache_prefetch_next ((__m128i*)ps);
1189         cache_prefetch_next ((__m128i*)pd);
1190         maybe_prefetch_next ((__m128i*)pm);
1191
1192         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1193         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1194
1195         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1196         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1197
1198         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1199                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1200         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1201                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1202
1203         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1204                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1205
1206         pix_add_multiply_2x128 (
1207             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1208             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1209             &xmm_dst_lo, &xmm_dst_hi);
1210
1211         save_128_aligned (
1212             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1213
1214         ps += 4;
1215         pd += 4;
1216         w -= 4;
1217         if (pm)
1218             pm += 4;
1219     }
1220
1221     while (w)
1222     {
1223         s = combine1 (ps, pm);
1224         d = *pd;
1225
1226         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1227         w--;
1228         ps++;
1229         if (pm)
1230             pm++;
1231     }
1232 }
1233
1234 static force_inline uint32_t
1235 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1236                                         uint32_t dst)
1237 {
1238     __m64 s = unpack_32_1x64 (src);
1239     __m64 d = unpack_32_1x64 (dst);
1240
1241     __m64 sa = expand_alpha_1x64 (s);
1242     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1243
1244     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1245 }
1246
1247 static force_inline void
1248 core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
1249                                   const uint32_t* ps,
1250                                   const uint32_t* pm,
1251                                   int             w)
1252 {
1253     uint32_t s, d;
1254
1255     __m128i xmm_src_lo, xmm_src_hi;
1256     __m128i xmm_dst_lo, xmm_dst_hi;
1257     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1258     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1259
1260     /* call prefetch hint to optimize cache load*/
1261     cache_prefetch ((__m128i*)ps);
1262     cache_prefetch ((__m128i*)pd);
1263     maybe_prefetch ((__m128i*)pm);
1264
1265     while (w && ((unsigned long) pd & 15))
1266     {
1267         s = combine1 (ps, pm);
1268         d = *pd;
1269
1270         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1271         ps++;
1272         w--;
1273         if (pm)
1274             pm++;
1275     }
1276
1277     /* call prefetch hint to optimize cache load*/
1278     cache_prefetch ((__m128i*)ps);
1279     cache_prefetch ((__m128i*)pd);
1280     maybe_prefetch ((__m128i*)pm);
1281
1282     while (w >= 4)
1283     {
1284         /* fill cache line with next memory */
1285         cache_prefetch_next ((__m128i*)ps);
1286         cache_prefetch_next ((__m128i*)pd);
1287         maybe_prefetch_next ((__m128i*)pm);
1288
1289         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1290         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1291
1292         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1293         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1294
1295         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1296                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1297         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1298                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1299
1300         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1301                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1302
1303         pix_add_multiply_2x128 (
1304             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1305             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1306             &xmm_dst_lo, &xmm_dst_hi);
1307
1308         save_128_aligned (
1309             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1310
1311         ps += 4;
1312         pd += 4;
1313         w -= 4;
1314         if (pm)
1315             pm += 4;
1316     }
1317
1318     while (w)
1319     {
1320         s = combine1 (ps, pm);
1321         d = *pd;
1322
1323         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1324         ps++;
1325         w--;
1326         if (pm)
1327             pm++;
1328     }
1329 }
1330
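/* XOR: result = src * (1 - dst.alpha) + dst * (1 - src.alpha). */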
1331 static force_inline uint32_t
1332 core_combine_xor_u_pixel_sse2 (uint32_t src,
1333                                uint32_t dst)
1334 {
1335     __m64 s = unpack_32_1x64 (src);
1336     __m64 d = unpack_32_1x64 (dst);
1337
1338     __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1339     __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1340
1341     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1342 }
1343
1344 static force_inline void
1345 core_combine_xor_u_sse2 (uint32_t*       dst,
1346                          const uint32_t* src,
1347                          const uint32_t *mask,
1348                          int             width)
1349 {
1350     int w = width;
1351     uint32_t s, d;
1352     uint32_t* pd = dst;
1353     const uint32_t* ps = src;
1354     const uint32_t* pm = mask;
1355
1356     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1357     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1358     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1359     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1360
1361     /* call prefetch hint to optimize cache load*/
1362     cache_prefetch ((__m128i*)ps);
1363     cache_prefetch ((__m128i*)pd);
1364     maybe_prefetch ((__m128i*)pm);
1365
1366     while (w && ((unsigned long) pd & 15))
1367     {
1368         s = combine1 (ps, pm);
1369         d = *pd;
1370
1371         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1372         w--;
1373         ps++;
1374         if (pm)
1375             pm++;
1376     }
1377
1378     /* call prefetch hint to optimize cache load*/
1379     cache_prefetch ((__m128i*)ps);
1380     cache_prefetch ((__m128i*)pd);
1381     maybe_prefetch ((__m128i*)pm);
1382
1383     while (w >= 4)
1384     {
1385         /* fill cache line with next memory */
1386         cache_prefetch_next ((__m128i*)ps);
1387         cache_prefetch_next ((__m128i*)pd);
1388         maybe_prefetch_next ((__m128i*)pm);
1389
1390         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1391         xmm_dst = load_128_aligned ((__m128i*) pd);
1392
1393         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1394         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1395
1396         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1397                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1398         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1399                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1400
1401         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1402                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1403         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1404                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1405
1406         pix_add_multiply_2x128 (
1407             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1408             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1409             &xmm_dst_lo, &xmm_dst_hi);
1410
1411         save_128_aligned (
1412             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1413
1414         ps += 4;
1415         pd += 4;
1416         w -= 4;
1417         if (pm)
1418             pm += 4;
1419     }
1420
1421     while (w)
1422     {
1423         s = combine1 (ps, pm);
1424         d = *pd;
1425
1426         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1427         w--;
1428         ps++;
1429         if (pm)
1430             pm++;
1431     }
1432 }
1433
1434 static force_inline void
1435 core_combine_add_u_sse2 (uint32_t*       dst,
1436                          const uint32_t* src,
1437                          const uint32_t* mask,
1438                          int             width)
1439 {
1440     int w = width;
1441     uint32_t s, d;
1442     uint32_t* pd = dst;
1443     const uint32_t* ps = src;
1444     const uint32_t* pm = mask;
1445
1446     /* call prefetch hint to optimize cache load*/
1447     cache_prefetch ((__m128i*)ps);
1448     cache_prefetch ((__m128i*)pd);
1449     maybe_prefetch ((__m128i*)pm);
1450
1451     while (w && (unsigned long)pd & 15)
1452     {
1453         s = combine1 (ps, pm);
1454         d = *pd;
1455
1456         ps++;
1457         if (pm)
1458             pm++;
1459         *pd++ = _mm_cvtsi64_si32 (
1460             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1461         w--;
1462     }
1463
1464     /* call prefetch hint to optimize cache load*/
1465     cache_prefetch ((__m128i*)ps);
1466     cache_prefetch ((__m128i*)pd);
1467     maybe_prefetch ((__m128i*)pm);
1468
1469     while (w >= 4)
1470     {
1471         __m128i s;
1472
1473         /* fill cache line with next memory */
1474         cache_prefetch_next ((__m128i*)ps);
1475         cache_prefetch_next ((__m128i*)pd);
1476         maybe_prefetch_next ((__m128i*)pm);
1477
1478         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1479
1480         save_128_aligned (
1481             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1482
1483         pd += 4;
1484         ps += 4;
1485         if (pm)
1486             pm += 4;
1487         w -= 4;
1488     }
1489
1490     while (w--)
1491     {
1492         s = combine1 (ps, pm);
1493         d = *pd;
1494
1495         ps++;
1496         *pd++ = _mm_cvtsi64_si32 (
1497             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1498         if (pm)
1499             pm++;
1500     }
1501 }
1502
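/* SATURATE: when src.alpha exceeds the room left in dst (255 - dst.alpha),
 * the source is first scaled by (255 - dst.alpha) / src.alpha so that the
 * following add cannot overflow; otherwise it degenerates to a plain ADD.
 */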
1503 static force_inline uint32_t
1504 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1505                                     uint32_t dst)
1506 {
1507     __m64 ms = unpack_32_1x64 (src);
1508     __m64 md = unpack_32_1x64 (dst);
1509     uint32_t sa = src >> 24;
1510     uint32_t da = ~dst >> 24;
1511
1512     if (sa > da)
1513     {
1514         ms = pix_multiply_1x64 (
1515             ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1516     }
1517
1518     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1519 }
1520
1521 static force_inline void
1522 core_combine_saturate_u_sse2 (uint32_t *      pd,
1523                               const uint32_t *ps,
1524                               const uint32_t *pm,
1525                               int             w)
1526 {
1527     uint32_t s, d;
1528
1529     uint32_t pack_cmp;
1530     __m128i xmm_src, xmm_dst;
1531
1532     /* call prefetch hint to optimize cache load*/
1533     cache_prefetch ((__m128i*)ps);
1534     cache_prefetch ((__m128i*)pd);
1535     maybe_prefetch ((__m128i*)pm);
1536
1537     while (w && (unsigned long)pd & 15)
1538     {
1539         s = combine1 (ps, pm);
1540         d = *pd;
1541
1542         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1543         w--;
1544         ps++;
1545         if (pm)
1546             pm++;
1547     }
1548
1549     /* call prefetch hint to optimize cache load*/
1550     cache_prefetch ((__m128i*)ps);
1551     cache_prefetch ((__m128i*)pd);
1552     maybe_prefetch ((__m128i*)pm);
1553
1554     while (w >= 4)
1555     {
1556         /* fill cache line with next memory */
1557         cache_prefetch_next ((__m128i*)ps);
1558         cache_prefetch_next ((__m128i*)pd);
1559         maybe_prefetch_next ((__m128i*)pm);
1560
1561         xmm_dst = load_128_aligned  ((__m128i*)pd);
1562         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1563
1564         pack_cmp = _mm_movemask_epi8 (
1565             _mm_cmpgt_epi32 (
1566                 _mm_srli_epi32 (xmm_src, 24),
1567                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1568
1569         /* if any src alpha is greater than the respective ~dst alpha */
1570         if (pack_cmp)
1571         {
1572             s = combine1 (ps++, pm);
1573             d = *pd;
1574             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1575             if (pm)
1576                 pm++;
1577
1578             s = combine1 (ps++, pm);
1579             d = *pd;
1580             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1581             if (pm)
1582                 pm++;
1583
1584             s = combine1 (ps++, pm);
1585             d = *pd;
1586             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1587             if (pm)
1588                 pm++;
1589
1590             s = combine1 (ps++, pm);
1591             d = *pd;
1592             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1593             if (pm)
1594                 pm++;
1595         }
1596         else
1597         {
1598             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1599
1600             pd += 4;
1601             ps += 4;
1602             if (pm)
1603                 pm += 4;
1604         }
1605
1606         w -= 4;
1607     }
1608
1609     while (w--)
1610     {
1611         s = combine1 (ps, pm);
1612         d = *pd;
1613
1614         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1615         ps++;
1616         if (pm)
1617             pm++;
1618     }
1619 }
1620
1621 static force_inline void
1622 core_combine_src_ca_sse2 (uint32_t*       pd,
1623                           const uint32_t* ps,
1624                           const uint32_t *pm,
1625                           int             w)
1626 {
1627     uint32_t s, m;
1628
1629     __m128i xmm_src_lo, xmm_src_hi;
1630     __m128i xmm_mask_lo, xmm_mask_hi;
1631     __m128i xmm_dst_lo, xmm_dst_hi;
1632
1633     /* call prefetch hint to optimize cache load*/
1634     cache_prefetch ((__m128i*)ps);
1635     cache_prefetch ((__m128i*)pd);
1636     cache_prefetch ((__m128i*)pm);
1637
1638     while (w && (unsigned long)pd & 15)
1639     {
1640         s = *ps++;
1641         m = *pm++;
1642         *pd++ = pack_1x64_32 (
1643             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1644         w--;
1645     }
1646
1647     /* call prefetch hint to optimize cache load*/
1648     cache_prefetch ((__m128i*)ps);
1649     cache_prefetch ((__m128i*)pd);
1650     cache_prefetch ((__m128i*)pm);
1651
1652     while (w >= 4)
1653     {
1654         /* fill cache line with next memory */
1655         cache_prefetch_next ((__m128i*)ps);
1656         cache_prefetch_next ((__m128i*)pd);
1657         cache_prefetch_next ((__m128i*)pm);
1658
1659         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1660         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1661
1662         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1663         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1664
1665         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1666                             &xmm_mask_lo, &xmm_mask_hi,
1667                             &xmm_dst_lo, &xmm_dst_hi);
1668
1669         save_128_aligned (
1670             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1671
1672         ps += 4;
1673         pd += 4;
1674         pm += 4;
1675         w -= 4;
1676     }
1677
1678     while (w)
1679     {
1680         s = *ps++;
1681         m = *pm++;
1682         *pd++ = pack_1x64_32 (
1683             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1684         w--;
1685     }
1686 }
1687
1688 static force_inline uint32_t
1689 core_combine_over_ca_pixel_sse2 (uint32_t src,
1690                                  uint32_t mask,
1691                                  uint32_t dst)
1692 {
1693     __m64 s = unpack_32_1x64 (src);
1694     __m64 expAlpha = expand_alpha_1x64 (s);
1695     __m64 unpk_mask = unpack_32_1x64 (mask);
1696     __m64 unpk_dst  = unpack_32_1x64 (dst);
1697
1698     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1699 }
1700
1701 static force_inline void
1702 core_combine_over_ca_sse2 (uint32_t*       pd,
1703                            const uint32_t* ps,
1704                            const uint32_t *pm,
1705                            int             w)
1706 {
1707     uint32_t s, m, d;
1708
1709     __m128i xmm_alpha_lo, xmm_alpha_hi;
1710     __m128i xmm_src_lo, xmm_src_hi;
1711     __m128i xmm_dst_lo, xmm_dst_hi;
1712     __m128i xmm_mask_lo, xmm_mask_hi;
1713
1714     /* call prefetch hint to optimize cache load*/
1715     cache_prefetch ((__m128i*)ps);
1716     cache_prefetch ((__m128i*)pd);
1717     cache_prefetch ((__m128i*)pm);
1718
1719     while (w && (unsigned long)pd & 15)
1720     {
1721         s = *ps++;
1722         m = *pm++;
1723         d = *pd;
1724
1725         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1726         w--;
1727     }
1728
1729     /* call prefetch hint to optimize cache load*/
1730     cache_prefetch ((__m128i*)ps);
1731     cache_prefetch ((__m128i*)pd);
1732     cache_prefetch ((__m128i*)pm);
1733
1734     while (w >= 4)
1735     {
1736         /* fill cache line with next memory */
1737         cache_prefetch_next ((__m128i*)ps);
1738         cache_prefetch_next ((__m128i*)pd);
1739         cache_prefetch_next ((__m128i*)pm);
1740
1741         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1742         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1743         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1744
1745         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1746         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1747         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1748
1749         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1750                             &xmm_alpha_lo, &xmm_alpha_hi);
1751
1752         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1753                        &xmm_alpha_lo, &xmm_alpha_hi,
1754                        &xmm_mask_lo, &xmm_mask_hi,
1755                        &xmm_dst_lo, &xmm_dst_hi);
1756
1757         save_128_aligned (
1758             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1759
1760         ps += 4;
1761         pd += 4;
1762         pm += 4;
1763         w -= 4;
1764     }
1765
1766     while (w)
1767     {
1768         s = *ps++;
1769         m = *pm++;
1770         d = *pd;
1771
1772         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1773         w--;
1774     }
1775 }
1776
1777 static force_inline uint32_t
1778 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1779                                          uint32_t mask,
1780                                          uint32_t dst)
1781 {
1782     __m64 d = unpack_32_1x64 (dst);
1783
1784     return pack_1x64_32 (
1785         over_1x64 (d, expand_alpha_1x64 (d),
1786                    pix_multiply_1x64 (unpack_32_1x64 (src),
1787                                       unpack_32_1x64 (mask))));
1788 }
1789
1790 static force_inline void
1791 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1792                                    const uint32_t* ps,
1793                                    const uint32_t *pm,
1794                                    int             w)
1795 {
1796     uint32_t s, m, d;
1797
1798     __m128i xmm_alpha_lo, xmm_alpha_hi;
1799     __m128i xmm_src_lo, xmm_src_hi;
1800     __m128i xmm_dst_lo, xmm_dst_hi;
1801     __m128i xmm_mask_lo, xmm_mask_hi;
1802
1803     /* call prefetch hint to optimize cache load*/
1804     cache_prefetch ((__m128i*)ps);
1805     cache_prefetch ((__m128i*)pd);
1806     cache_prefetch ((__m128i*)pm);
1807
1808     while (w && (unsigned long)pd & 15)
1809     {
1810         s = *ps++;
1811         m = *pm++;
1812         d = *pd;
1813
1814         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1815         w--;
1816     }
1817
1818     /* call prefetch hint to optimize cache load*/
1819     cache_prefetch ((__m128i*)ps);
1820     cache_prefetch ((__m128i*)pd);
1821     cache_prefetch ((__m128i*)pm);
1822
1823     while (w >= 4)
1824     {
1825         /* fill cache line with next memory */
1826         cache_prefetch_next ((__m128i*)ps);
1827         cache_prefetch_next ((__m128i*)pd);
1828         cache_prefetch_next ((__m128i*)pm);
1829
1830         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1831         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1832         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1833
1834         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1835         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1836         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1837
1838         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1839                             &xmm_alpha_lo, &xmm_alpha_hi);
1840         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1841                             &xmm_mask_lo, &xmm_mask_hi,
1842                             &xmm_mask_lo, &xmm_mask_hi);
1843
1844         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1845                     &xmm_alpha_lo, &xmm_alpha_hi,
1846                     &xmm_mask_lo, &xmm_mask_hi);
1847
1848         save_128_aligned (
1849             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1850
1851         ps += 4;
1852         pd += 4;
1853         pm += 4;
1854         w -= 4;
1855     }
1856
1857     while (w)
1858     {
1859         s = *ps++;
1860         m = *pm++;
1861         d = *pd;
1862
1863         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1864         w--;
1865     }
1866 }
1867
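/* Component-alpha IN: dest = (src * mask) * dest_alpha */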
1868 static force_inline void
1869 core_combine_in_ca_sse2 (uint32_t *      pd,
1870                          const uint32_t *ps,
1871                          const uint32_t *pm,
1872                          int             w)
1873 {
1874     uint32_t s, m, d;
1875
1876     __m128i xmm_alpha_lo, xmm_alpha_hi;
1877     __m128i xmm_src_lo, xmm_src_hi;
1878     __m128i xmm_dst_lo, xmm_dst_hi;
1879     __m128i xmm_mask_lo, xmm_mask_hi;
1880
1881     /* call prefetch hint to optimize cache load*/
1882     cache_prefetch ((__m128i*)ps);
1883     cache_prefetch ((__m128i*)pd);
1884     cache_prefetch ((__m128i*)pm);
1885
1886     while (w && (unsigned long)pd & 15)
1887     {
1888         s = *ps++;
1889         m = *pm++;
1890         d = *pd;
1891
1892         *pd++ = pack_1x64_32 (
1893             pix_multiply_1x64 (
1894                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1895                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1896
1897         w--;
1898     }
1899
1900     /* call prefetch hint to optimize cache load*/
1901     cache_prefetch ((__m128i*)ps);
1902     cache_prefetch ((__m128i*)pd);
1903     cache_prefetch ((__m128i*)pm);
1904
1905     while (w >= 4)
1906     {
1907         /* fill cache line with next memory */
1908         cache_prefetch_next ((__m128i*)ps);
1909         cache_prefetch_next ((__m128i*)pd);
1910         cache_prefetch_next ((__m128i*)pm);
1911
1912         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1913         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1914         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1915
1916         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1917         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1918         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1919
1920         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1921                             &xmm_alpha_lo, &xmm_alpha_hi);
1922
1923         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1924                             &xmm_mask_lo, &xmm_mask_hi,
1925                             &xmm_dst_lo, &xmm_dst_hi);
1926
1927         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1928                             &xmm_alpha_lo, &xmm_alpha_hi,
1929                             &xmm_dst_lo, &xmm_dst_hi);
1930
1931         save_128_aligned (
1932             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1933
1934         ps += 4;
1935         pd += 4;
1936         pm += 4;
1937         w -= 4;
1938     }
1939
1940     while (w)
1941     {
1942         s = *ps++;
1943         m = *pm++;
1944         d = *pd;
1945
1946         *pd++ = pack_1x64_32 (
1947             pix_multiply_1x64 (
1948                 pix_multiply_1x64 (
1949                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1950                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1951
1952         w--;
1953     }
1954 }
1955
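/* Component-alpha IN_REVERSE: dest = dest * (mask * src_alpha) */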
1956 static force_inline void
1957 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1958                                  const uint32_t *ps,
1959                                  const uint32_t *pm,
1960                                  int             w)
1961 {
1962     uint32_t s, m, d;
1963
1964     __m128i xmm_alpha_lo, xmm_alpha_hi;
1965     __m128i xmm_src_lo, xmm_src_hi;
1966     __m128i xmm_dst_lo, xmm_dst_hi;
1967     __m128i xmm_mask_lo, xmm_mask_hi;
1968
1969     /* call prefetch hint to optimize cache load*/
1970     cache_prefetch ((__m128i*)ps);
1971     cache_prefetch ((__m128i*)pd);
1972     cache_prefetch ((__m128i*)pm);
1973
1974     while (w && (unsigned long)pd & 15)
1975     {
1976         s = *ps++;
1977         m = *pm++;
1978         d = *pd;
1979
1980         *pd++ = pack_1x64_32 (
1981             pix_multiply_1x64 (
1982                 unpack_32_1x64 (d),
1983                 pix_multiply_1x64 (unpack_32_1x64 (m),
1984                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1985         w--;
1986     }
1987
1988     /* call prefetch hint to optimize cache load*/
1989     cache_prefetch ((__m128i*)ps);
1990     cache_prefetch ((__m128i*)pd);
1991     cache_prefetch ((__m128i*)pm);
1992
1993     while (w >= 4)
1994     {
1995         /* fill cache line with next memory */
1996         cache_prefetch_next ((__m128i*)ps);
1997         cache_prefetch_next ((__m128i*)pd);
1998         cache_prefetch_next ((__m128i*)pm);
1999
2000         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2001         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2002         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2003
2004         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2005         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2006         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2007
2008         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2009                             &xmm_alpha_lo, &xmm_alpha_hi);
2010         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2011                             &xmm_alpha_lo, &xmm_alpha_hi,
2012                             &xmm_alpha_lo, &xmm_alpha_hi);
2013
2014         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2015                             &xmm_alpha_lo, &xmm_alpha_hi,
2016                             &xmm_dst_lo, &xmm_dst_hi);
2017
2018         save_128_aligned (
2019             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2020
2021         ps += 4;
2022         pd += 4;
2023         pm += 4;
2024         w -= 4;
2025     }
2026
2027     while (w)
2028     {
2029         s = *ps++;
2030         m = *pm++;
2031         d = *pd;
2032
2033         *pd++ = pack_1x64_32 (
2034             pix_multiply_1x64 (
2035                 unpack_32_1x64 (d),
2036                 pix_multiply_1x64 (unpack_32_1x64 (m),
2037                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
2038         w--;
2039     }
2040 }
2041
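/* Component-alpha OUT: dest = (src * mask) * (1 - dest_alpha) */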
2042 static force_inline void
2043 core_combine_out_ca_sse2 (uint32_t *      pd,
2044                           const uint32_t *ps,
2045                           const uint32_t *pm,
2046                           int             w)
2047 {
2048     uint32_t s, m, d;
2049
2050     __m128i xmm_alpha_lo, xmm_alpha_hi;
2051     __m128i xmm_src_lo, xmm_src_hi;
2052     __m128i xmm_dst_lo, xmm_dst_hi;
2053     __m128i xmm_mask_lo, xmm_mask_hi;
2054
2055     /* call prefetch hint to optimize cache load*/
2056     cache_prefetch ((__m128i*)ps);
2057     cache_prefetch ((__m128i*)pd);
2058     cache_prefetch ((__m128i*)pm);
2059
2060     while (w && (unsigned long)pd & 15)
2061     {
2062         s = *ps++;
2063         m = *pm++;
2064         d = *pd;
2065
2066         *pd++ = pack_1x64_32 (
2067             pix_multiply_1x64 (
2068                 pix_multiply_1x64 (
2069                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2070                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2071         w--;
2072     }
2073
2074     /* call prefetch hint to optimize cache load*/
2075     cache_prefetch ((__m128i*)ps);
2076     cache_prefetch ((__m128i*)pd);
2077     cache_prefetch ((__m128i*)pm);
2078
2079     while (w >= 4)
2080     {
2081         /* fill cache line with next memory */
2082         cache_prefetch_next ((__m128i*)ps);
2083         cache_prefetch_next ((__m128i*)pd);
2084         cache_prefetch_next ((__m128i*)pm);
2085
2086         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2087         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2088         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2089
2090         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2091         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2092         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2093
2094         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2095                             &xmm_alpha_lo, &xmm_alpha_hi);
2096         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2097                       &xmm_alpha_lo, &xmm_alpha_hi);
2098
2099         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2100                             &xmm_mask_lo, &xmm_mask_hi,
2101                             &xmm_dst_lo, &xmm_dst_hi);
2102         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2103                             &xmm_alpha_lo, &xmm_alpha_hi,
2104                             &xmm_dst_lo, &xmm_dst_hi);
2105
2106         save_128_aligned (
2107             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2108
2109         ps += 4;
2110         pd += 4;
2111         pm += 4;
2112         w -= 4;
2113     }
2114
2115     while (w)
2116     {
2117         s = *ps++;
2118         m = *pm++;
2119         d = *pd;
2120
2121         *pd++ = pack_1x64_32 (
2122             pix_multiply_1x64 (
2123                 pix_multiply_1x64 (
2124                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2125                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2126
2127         w--;
2128     }
2129 }
2130
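/* Component-alpha OUT_REVERSE: dest = dest * (1 - mask * src_alpha) */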
2131 static force_inline void
2132 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
2133                                   const uint32_t *ps,
2134                                   const uint32_t *pm,
2135                                   int             w)
2136 {
2137     uint32_t s, m, d;
2138
2139     __m128i xmm_alpha_lo, xmm_alpha_hi;
2140     __m128i xmm_src_lo, xmm_src_hi;
2141     __m128i xmm_dst_lo, xmm_dst_hi;
2142     __m128i xmm_mask_lo, xmm_mask_hi;
2143
2144     /* call prefetch hint to optimize cache load*/
2145     cache_prefetch ((__m128i*)ps);
2146     cache_prefetch ((__m128i*)pd);
2147     cache_prefetch ((__m128i*)pm);
2148
2149     while (w && (unsigned long)pd & 15)
2150     {
2151         s = *ps++;
2152         m = *pm++;
2153         d = *pd;
2154
2155         *pd++ = pack_1x64_32 (
2156             pix_multiply_1x64 (
2157                 unpack_32_1x64 (d),
2158                 negate_1x64 (pix_multiply_1x64 (
2159                                  unpack_32_1x64 (m),
2160                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2161         w--;
2162     }
2163
2164     /* call prefetch hint to optimize cache load*/
2165     cache_prefetch ((__m128i*)ps);
2166     cache_prefetch ((__m128i*)pd);
2167     cache_prefetch ((__m128i*)pm);
2168
2169     while (w >= 4)
2170     {
2171         /* fill cache line with next memory */
2172         cache_prefetch_next ((__m128i*)ps);
2173         cache_prefetch_next ((__m128i*)pd);
2174         cache_prefetch_next ((__m128i*)pm);
2175
2176         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2177         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2178         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2179
2180         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2181         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2182         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2183
2184         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2185                             &xmm_alpha_lo, &xmm_alpha_hi);
2186
2187         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2188                             &xmm_alpha_lo, &xmm_alpha_hi,
2189                             &xmm_mask_lo, &xmm_mask_hi);
2190
2191         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2192                       &xmm_mask_lo, &xmm_mask_hi);
2193
2194         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2195                             &xmm_mask_lo, &xmm_mask_hi,
2196                             &xmm_dst_lo, &xmm_dst_hi);
2197
2198         save_128_aligned (
2199             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2200
2201         ps += 4;
2202         pd += 4;
2203         pm += 4;
2204         w -= 4;
2205     }
2206
2207     while (w)
2208     {
2209         s = *ps++;
2210         m = *pm++;
2211         d = *pd;
2212
2213         *pd++ = pack_1x64_32 (
2214             pix_multiply_1x64 (
2215                 unpack_32_1x64 (d),
2216                 negate_1x64 (pix_multiply_1x64 (
2217                                  unpack_32_1x64 (m),
2218                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2219         w--;
2220     }
2221 }
2222
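/* Component-alpha ATOP for one pixel:
 * dest = (src * mask) * dest_alpha + dest * (1 - mask * src_alpha)
 */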
2223 static force_inline uint32_t
2224 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2225                                  uint32_t mask,
2226                                  uint32_t dst)
2227 {
2228     __m64 m = unpack_32_1x64 (mask);
2229     __m64 s = unpack_32_1x64 (src);
2230     __m64 d = unpack_32_1x64 (dst);
2231     __m64 sa = expand_alpha_1x64 (s);
2232     __m64 da = expand_alpha_1x64 (d);
2233
2234     s = pix_multiply_1x64 (s, m);
2235     m = negate_1x64 (pix_multiply_1x64 (m, sa));
2236
2237     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2238 }
2239
2240 static force_inline void
2241 core_combine_atop_ca_sse2 (uint32_t *      pd,
2242                            const uint32_t *ps,
2243                            const uint32_t *pm,
2244                            int             w)
2245 {
2246     uint32_t s, m, d;
2247
2248     __m128i xmm_src_lo, xmm_src_hi;
2249     __m128i xmm_dst_lo, xmm_dst_hi;
2250     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2251     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2252     __m128i xmm_mask_lo, xmm_mask_hi;
2253
2254     /* call prefetch hint to optimize cache load*/
2255     cache_prefetch ((__m128i*)ps);
2256     cache_prefetch ((__m128i*)pd);
2257     cache_prefetch ((__m128i*)pm);
2258
2259     while (w && (unsigned long)pd & 15)
2260     {
2261         s = *ps++;
2262         m = *pm++;
2263         d = *pd;
2264
2265         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2266         w--;
2267     }
2268
2269     /* call prefetch hint to optimize cache load*/
2270     cache_prefetch ((__m128i*)ps);
2271     cache_prefetch ((__m128i*)pd);
2272     cache_prefetch ((__m128i*)pm);
2273
2274     while (w >= 4)
2275     {
2276         /* fill cache line with next memory */
2277         cache_prefetch_next ((__m128i*)ps);
2278         cache_prefetch_next ((__m128i*)pd);
2279         cache_prefetch_next ((__m128i*)pm);
2280
2281         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2282         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2283         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2284
2285         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2286         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2287         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2288
2289         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2290                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2291         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2292                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2293
2294         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2295                             &xmm_mask_lo, &xmm_mask_hi,
2296                             &xmm_src_lo, &xmm_src_hi);
2297         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2298                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2299                             &xmm_mask_lo, &xmm_mask_hi);
2300
2301         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2302
2303         pix_add_multiply_2x128 (
2304             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2305             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2306             &xmm_dst_lo, &xmm_dst_hi);
2307
2308         save_128_aligned (
2309             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2310
2311         ps += 4;
2312         pd += 4;
2313         pm += 4;
2314         w -= 4;
2315     }
2316
2317     while (w)
2318     {
2319         s = *ps++;
2320         m = *pm++;
2321         d = *pd;
2322
2323         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2324         w--;
2325     }
2326 }
2327
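/* Component-alpha ATOP_REVERSE for one pixel:
 * dest = (src * mask) * (1 - dest_alpha) + dest * (mask * src_alpha)
 */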
2328 static force_inline uint32_t
2329 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2330                                          uint32_t mask,
2331                                          uint32_t dst)
2332 {
2333     __m64 m = unpack_32_1x64 (mask);
2334     __m64 s = unpack_32_1x64 (src);
2335     __m64 d = unpack_32_1x64 (dst);
2336
2337     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2338     __m64 sa = expand_alpha_1x64 (s);
2339
2340     s = pix_multiply_1x64 (s, m);
2341     m = pix_multiply_1x64 (m, sa);
2342
2343     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2344 }
2345
2346 static force_inline void
2347 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2348                                    const uint32_t *ps,
2349                                    const uint32_t *pm,
2350                                    int             w)
2351 {
2352     uint32_t s, m, d;
2353
2354     __m128i xmm_src_lo, xmm_src_hi;
2355     __m128i xmm_dst_lo, xmm_dst_hi;
2356     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2357     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2358     __m128i xmm_mask_lo, xmm_mask_hi;
2359
2360     /* call prefetch hint to optimize cache load*/
2361     cache_prefetch ((__m128i*)ps);
2362     cache_prefetch ((__m128i*)pd);
2363     cache_prefetch ((__m128i*)pm);
2364
2365     while (w && (unsigned long)pd & 15)
2366     {
2367         s = *ps++;
2368         m = *pm++;
2369         d = *pd;
2370
2371         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2372         w--;
2373     }
2374
2375     /* call prefetch hint to optimize cache load*/
2376     cache_prefetch ((__m128i*)ps);
2377     cache_prefetch ((__m128i*)pd);
2378     cache_prefetch ((__m128i*)pm);
2379
2380     while (w >= 4)
2381     {
2382         /* fill cache line with next memory */
2383         cache_prefetch_next ((__m128i*)ps);
2384         cache_prefetch_next ((__m128i*)pd);
2385         cache_prefetch_next ((__m128i*)pm);
2386
2387         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2388         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2389         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2390
2391         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2392         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2393         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2394
2395         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2396                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2397         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2398                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2399
2400         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2401                             &xmm_mask_lo, &xmm_mask_hi,
2402                             &xmm_src_lo, &xmm_src_hi);
2403         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2404                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2405                             &xmm_mask_lo, &xmm_mask_hi);
2406
2407         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2408                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2409
2410         pix_add_multiply_2x128 (
2411             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2412             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2413             &xmm_dst_lo, &xmm_dst_hi);
2414
2415         save_128_aligned (
2416             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2417
2418         ps += 4;
2419         pd += 4;
2420         pm += 4;
2421         w -= 4;
2422     }
2423
2424     while (w)
2425     {
2426         s = *ps++;
2427         m = *pm++;
2428         d = *pd;
2429
2430         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2431         w--;
2432     }
2433 }
2434
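/* Component-alpha XOR for one pixel:
 * dest = (src * mask) * (1 - dest_alpha) + dest * (1 - mask * src_alpha)
 */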
2435 static force_inline uint32_t
2436 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2437                                 uint32_t mask,
2438                                 uint32_t dst)
2439 {
2440     __m64 a = unpack_32_1x64 (mask);
2441     __m64 s = unpack_32_1x64 (src);
2442     __m64 d = unpack_32_1x64 (dst);
2443
2444     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2445                                        a, expand_alpha_1x64 (s)));
2446     __m64 dest      = pix_multiply_1x64 (s, a);
2447     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2448
2449     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2450                                                 &alpha_dst,
2451                                                 &dest,
2452                                                 &alpha_src));
2453 }
2454
2455 static force_inline void
2456 core_combine_xor_ca_sse2 (uint32_t *      pd,
2457                           const uint32_t *ps,
2458                           const uint32_t *pm,
2459                           int             w)
2460 {
2461     uint32_t s, m, d;
2462
2463     __m128i xmm_src_lo, xmm_src_hi;
2464     __m128i xmm_dst_lo, xmm_dst_hi;
2465     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2466     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2467     __m128i xmm_mask_lo, xmm_mask_hi;
2468
2469     /* call prefetch hint to optimize cache load*/
2470     cache_prefetch ((__m128i*)ps);
2471     cache_prefetch ((__m128i*)pd);
2472     cache_prefetch ((__m128i*)pm);
2473
2474     while (w && (unsigned long)pd & 15)
2475     {
2476         s = *ps++;
2477         m = *pm++;
2478         d = *pd;
2479
2480         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2481         w--;
2482     }
2483
2484     /* call prefetch hint to optimize cache load*/
2485     cache_prefetch ((__m128i*)ps);
2486     cache_prefetch ((__m128i*)pd);
2487     cache_prefetch ((__m128i*)pm);
2488
2489     while (w >= 4)
2490     {
2491         /* fill cache line with next memory */
2492         cache_prefetch_next ((__m128i*)ps);
2493         cache_prefetch_next ((__m128i*)pd);
2494         cache_prefetch_next ((__m128i*)pm);
2495
2496         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2497         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2498         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2499
2500         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2501         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2502         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2503
2504         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2505                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2506         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2507                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2508
2509         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2510                             &xmm_mask_lo, &xmm_mask_hi,
2511                             &xmm_src_lo, &xmm_src_hi);
2512         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2513                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2514                             &xmm_mask_lo, &xmm_mask_hi);
2515
2516         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2517                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2518         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2519                       &xmm_mask_lo, &xmm_mask_hi);
2520
2521         pix_add_multiply_2x128 (
2522             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2523             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2524             &xmm_dst_lo, &xmm_dst_hi);
2525
2526         save_128_aligned (
2527             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2528
2529         ps += 4;
2530         pd += 4;
2531         pm += 4;
2532         w -= 4;
2533     }
2534
2535     while (w)
2536     {
2537         s = *ps++;
2538         m = *pm++;
2539         d = *pd;
2540
2541         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2542         w--;
2543     }
2544 }
2545
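/* Component-alpha ADD: dest = clamp (src * mask + dest), using saturating byte adds */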
2546 static force_inline void
2547 core_combine_add_ca_sse2 (uint32_t *      pd,
2548                           const uint32_t *ps,
2549                           const uint32_t *pm,
2550                           int             w)
2551 {
2552     uint32_t s, m, d;
2553
2554     __m128i xmm_src_lo, xmm_src_hi;
2555     __m128i xmm_dst_lo, xmm_dst_hi;
2556     __m128i xmm_mask_lo, xmm_mask_hi;
2557
2558     /* call prefetch hint to optimize cache load*/
2559     cache_prefetch ((__m128i*)ps);
2560     cache_prefetch ((__m128i*)pd);
2561     cache_prefetch ((__m128i*)pm);
2562
2563     while (w && (unsigned long)pd & 15)
2564     {
2565         s = *ps++;
2566         m = *pm++;
2567         d = *pd;
2568
2569         *pd++ = pack_1x64_32 (
2570             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2571                                              unpack_32_1x64 (m)),
2572                           unpack_32_1x64 (d)));
2573         w--;
2574     }
2575
2576     /* call prefetch hint to optimize cache load*/
2577     cache_prefetch ((__m128i*)ps);
2578     cache_prefetch ((__m128i*)pd);
2579     cache_prefetch ((__m128i*)pm);
2580
2581     while (w >= 4)
2582     {
2583         /* fill cache line with next memory */
2584         cache_prefetch_next ((__m128i*)ps);
2585         cache_prefetch_next ((__m128i*)pd);
2586         cache_prefetch_next ((__m128i*)pm);
2587
2588         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2589         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2590         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2591
2592         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2593         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2594         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2595
2596         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2597                             &xmm_mask_lo, &xmm_mask_hi,
2598                             &xmm_src_lo, &xmm_src_hi);
2599
2600         save_128_aligned (
2601             (__m128i*)pd, pack_2x128_128 (
2602                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2603                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2604
2605         ps += 4;
2606         pd += 4;
2607         pm += 4;
2608         w -= 4;
2609     }
2610
2611     while (w)
2612     {
2613         s = *ps++;
2614         m = *pm++;
2615         d = *pd;
2616
2617         *pd++ = pack_1x64_32 (
2618             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2619                                              unpack_32_1x64 (m)),
2620                           unpack_32_1x64 (d)));
2621         w--;
2622     }
2623 }
2624
2625 /* ---------------------------------------------------
2626  * fb_compose_setup_sse2
2627  */
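/* Helpers that broadcast a constant into an MMX or SSE2 register; they are
 * used when initializing the mask_* constants declared at the top of this file.
 */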
2628 static force_inline __m64
2629 create_mask_16_64 (uint16_t mask)
2630 {
2631     return _mm_set1_pi16 (mask);
2632 }
2633
2634 static force_inline __m128i
2635 create_mask_16_128 (uint16_t mask)
2636 {
2637     return _mm_set1_epi16 (mask);
2638 }
2639
2640 static force_inline __m64
2641 create_mask_2x32_64 (uint32_t mask0,
2642                      uint32_t mask1)
2643 {
2644     return _mm_set_pi32 (mask0, mask1);
2645 }
2646
2647 /* Work around a code generation bug in Sun Studio 12. */
2648 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2649 # define create_mask_2x32_128(mask0, mask1) \
2650         (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2651 #else
2652 static force_inline __m128i
2653 create_mask_2x32_128 (uint32_t mask0,
2654                       uint32_t mask1)
2655 {
2656     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2657 }
2658 #endif
2659
2660 /* SSE2 code patch for fbcompose.c */
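/* Thin wrappers exposing the core_combine_*_sse2 routines through the generic
 * combiner signature; each one finishes with _mm_empty () so the MMX state is
 * cleared before returning.
 */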
2661
2662 static void
2663 sse2_combine_over_u (pixman_implementation_t *imp,
2664                      pixman_op_t              op,
2665                      uint32_t *               dst,
2666                      const uint32_t *         src,
2667                      const uint32_t *         mask,
2668                      int                      width)
2669 {
2670     core_combine_over_u_sse2 (dst, src, mask, width);
2671     _mm_empty ();
2672 }
2673
2674 static void
2675 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2676                              pixman_op_t              op,
2677                              uint32_t *               dst,
2678                              const uint32_t *         src,
2679                              const uint32_t *         mask,
2680                              int                      width)
2681 {
2682     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2683     _mm_empty ();
2684 }
2685
2686 static void
2687 sse2_combine_in_u (pixman_implementation_t *imp,
2688                    pixman_op_t              op,
2689                    uint32_t *               dst,
2690                    const uint32_t *         src,
2691                    const uint32_t *         mask,
2692                    int                      width)
2693 {
2694     core_combine_in_u_sse2 (dst, src, mask, width);
2695     _mm_empty ();
2696 }
2697
2698 static void
2699 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2700                            pixman_op_t              op,
2701                            uint32_t *               dst,
2702                            const uint32_t *         src,
2703                            const uint32_t *         mask,
2704                            int                      width)
2705 {
2706     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2707     _mm_empty ();
2708 }
2709
2710 static void
2711 sse2_combine_out_u (pixman_implementation_t *imp,
2712                     pixman_op_t              op,
2713                     uint32_t *               dst,
2714                     const uint32_t *         src,
2715                     const uint32_t *         mask,
2716                     int                      width)
2717 {
2718     core_combine_out_u_sse2 (dst, src, mask, width);
2719     _mm_empty ();
2720 }
2721
2722 static void
2723 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2724                             pixman_op_t              op,
2725                             uint32_t *               dst,
2726                             const uint32_t *         src,
2727                             const uint32_t *         mask,
2728                             int                      width)
2729 {
2730     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2731     _mm_empty ();
2732 }
2733
2734 static void
2735 sse2_combine_atop_u (pixman_implementation_t *imp,
2736                      pixman_op_t              op,
2737                      uint32_t *               dst,
2738                      const uint32_t *         src,
2739                      const uint32_t *         mask,
2740                      int                      width)
2741 {
2742     core_combine_atop_u_sse2 (dst, src, mask, width);
2743     _mm_empty ();
2744 }
2745
2746 static void
2747 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2748                              pixman_op_t              op,
2749                              uint32_t *               dst,
2750                              const uint32_t *         src,
2751                              const uint32_t *         mask,
2752                              int                      width)
2753 {
2754     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2755     _mm_empty ();
2756 }
2757
2758 static void
2759 sse2_combine_xor_u (pixman_implementation_t *imp,
2760                     pixman_op_t              op,
2761                     uint32_t *               dst,
2762                     const uint32_t *         src,
2763                     const uint32_t *         mask,
2764                     int                      width)
2765 {
2766     core_combine_xor_u_sse2 (dst, src, mask, width);
2767     _mm_empty ();
2768 }
2769
2770 static void
2771 sse2_combine_add_u (pixman_implementation_t *imp,
2772                     pixman_op_t              op,
2773                     uint32_t *               dst,
2774                     const uint32_t *         src,
2775                     const uint32_t *         mask,
2776                     int                      width)
2777 {
2778     core_combine_add_u_sse2 (dst, src, mask, width);
2779     _mm_empty ();
2780 }
2781
2782 static void
2783 sse2_combine_saturate_u (pixman_implementation_t *imp,
2784                          pixman_op_t              op,
2785                          uint32_t *               dst,
2786                          const uint32_t *         src,
2787                          const uint32_t *         mask,
2788                          int                      width)
2789 {
2790     core_combine_saturate_u_sse2 (dst, src, mask, width);
2791     _mm_empty ();
2792 }
2793
2794 static void
2795 sse2_combine_src_ca (pixman_implementation_t *imp,
2796                      pixman_op_t              op,
2797                      uint32_t *               dst,
2798                      const uint32_t *         src,
2799                      const uint32_t *         mask,
2800                      int                      width)
2801 {
2802     core_combine_src_ca_sse2 (dst, src, mask, width);
2803     _mm_empty ();
2804 }
2805
2806 static void
2807 sse2_combine_over_ca (pixman_implementation_t *imp,
2808                       pixman_op_t              op,
2809                       uint32_t *               dst,
2810                       const uint32_t *         src,
2811                       const uint32_t *         mask,
2812                       int                      width)
2813 {
2814     core_combine_over_ca_sse2 (dst, src, mask, width);
2815     _mm_empty ();
2816 }
2817
2818 static void
2819 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2820                               pixman_op_t              op,
2821                               uint32_t *               dst,
2822                               const uint32_t *         src,
2823                               const uint32_t *         mask,
2824                               int                      width)
2825 {
2826     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2827     _mm_empty ();
2828 }
2829
2830 static void
2831 sse2_combine_in_ca (pixman_implementation_t *imp,
2832                     pixman_op_t              op,
2833                     uint32_t *               dst,
2834                     const uint32_t *         src,
2835                     const uint32_t *         mask,
2836                     int                      width)
2837 {
2838     core_combine_in_ca_sse2 (dst, src, mask, width);
2839     _mm_empty ();
2840 }
2841
2842 static void
2843 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2844                             pixman_op_t              op,
2845                             uint32_t *               dst,
2846                             const uint32_t *         src,
2847                             const uint32_t *         mask,
2848                             int                      width)
2849 {
2850     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2851     _mm_empty ();
2852 }
2853
2854 static void
2855 sse2_combine_out_ca (pixman_implementation_t *imp,
2856                      pixman_op_t              op,
2857                      uint32_t *               dst,
2858                      const uint32_t *         src,
2859                      const uint32_t *         mask,
2860                      int                      width)
2861 {
2862     core_combine_out_ca_sse2 (dst, src, mask, width);
2863     _mm_empty ();
2864 }
2865
2866 static void
2867 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2868                              pixman_op_t              op,
2869                              uint32_t *               dst,
2870                              const uint32_t *         src,
2871                              const uint32_t *         mask,
2872                              int                      width)
2873 {
2874     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2875     _mm_empty ();
2876 }
2877
2878 static void
2879 sse2_combine_atop_ca (pixman_implementation_t *imp,
2880                       pixman_op_t              op,
2881                       uint32_t *               dst,
2882                       const uint32_t *         src,
2883                       const uint32_t *         mask,
2884                       int                      width)
2885 {
2886     core_combine_atop_ca_sse2 (dst, src, mask, width);
2887     _mm_empty ();
2888 }
2889
2890 static void
2891 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2892                               pixman_op_t              op,
2893                               uint32_t *               dst,
2894                               const uint32_t *         src,
2895                               const uint32_t *         mask,
2896                               int                      width)
2897 {
2898     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2899     _mm_empty ();
2900 }
2901
2902 static void
2903 sse2_combine_xor_ca (pixman_implementation_t *imp,
2904                      pixman_op_t              op,
2905                      uint32_t *               dst,
2906                      const uint32_t *         src,
2907                      const uint32_t *         mask,
2908                      int                      width)
2909 {
2910     core_combine_xor_ca_sse2 (dst, src, mask, width);
2911     _mm_empty ();
2912 }
2913
2914 static void
2915 sse2_combine_add_ca (pixman_implementation_t *imp,
2916                      pixman_op_t              op,
2917                      uint32_t *               dst,
2918                      const uint32_t *         src,
2919                      const uint32_t *         mask,
2920                      int                      width)
2921 {
2922     core_combine_add_ca_sse2 (dst, src, mask, width);
2923     _mm_empty ();
2924 }
2925
2926 /* -------------------------------------------------------------------
2927  * composite_over_n_8888
2928  */
2929
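/* Composite a solid source OVER a 32-bit destination: scalar head/tail loops
 * handle unaligned pixels, the main loop processes four pixels per iteration.
 */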
2930 static void
2931 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2932                             pixman_op_t              op,
2933                             pixman_image_t *         src_image,
2934                             pixman_image_t *         mask_image,
2935                             pixman_image_t *         dst_image,
2936                             int32_t                  src_x,
2937                             int32_t                  src_y,
2938                             int32_t                  mask_x,
2939                             int32_t                  mask_y,
2940                             int32_t                  dest_x,
2941                             int32_t                  dest_y,
2942                             int32_t                  width,
2943                             int32_t                  height)
2944 {
2945     uint32_t src;
2946     uint32_t    *dst_line, *dst, d;
2947     uint16_t w;
2948     int dst_stride;
2949     __m128i xmm_src, xmm_alpha;
2950     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2951
2952     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2953
2954     if (src == 0)
2955         return;
2956
2957     PIXMAN_IMAGE_GET_LINE (
2958         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2959
2960     xmm_src = expand_pixel_32_1x128 (src);
2961     xmm_alpha = expand_alpha_1x128 (xmm_src);
2962
2963     while (height--)
2964     {
2965         dst = dst_line;
2966
2967         /* call prefetch hint to optimize cache load*/
2968         cache_prefetch ((__m128i*)dst);
2969
2970         dst_line += dst_stride;
2971         w = width;
2972
2973         while (w && (unsigned long)dst & 15)
2974         {
2975             d = *dst;
2976             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2977                                               _mm_movepi64_pi64 (xmm_alpha),
2978                                               unpack_32_1x64 (d)));
2979             w--;
2980         }
2981
2982         cache_prefetch ((__m128i*)dst);
2983
2984         while (w >= 4)
2985         {
2986             /* fill cache line with next memory */
2987             cache_prefetch_next ((__m128i*)dst);
2988
2989             xmm_dst = load_128_aligned ((__m128i*)dst);
2990
2991             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2992
2993             over_2x128 (&xmm_src, &xmm_src,
2994                         &xmm_alpha, &xmm_alpha,
2995                         &xmm_dst_lo, &xmm_dst_hi);
2996
2997             /* rebuild the 4 pixel data and save */
2998             save_128_aligned (
2999                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3000
3001             w -= 4;
3002             dst += 4;
3003         }
3004
3005         while (w)
3006         {
3007             d = *dst;
3008             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3009                                               _mm_movepi64_pi64 (xmm_alpha),
3010                                               unpack_32_1x64 (d)));
3011             w--;
3012         }
3013
3014     }
3015     _mm_empty ();
3016 }
3017
3018 /* ---------------------------------------------------------------------
3019  * composite_over_n_0565
3020  */
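/* Composite a solid source OVER an r5g6b5 destination, eight 16-bit pixels per SSE2 iteration. */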
3021 static void
3022 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3023                             pixman_op_t              op,
3024                             pixman_image_t *         src_image,
3025                             pixman_image_t *         mask_image,
3026                             pixman_image_t *         dst_image,
3027                             int32_t                  src_x,
3028                             int32_t                  src_y,
3029                             int32_t                  mask_x,
3030                             int32_t                  mask_y,
3031                             int32_t                  dest_x,
3032                             int32_t                  dest_y,
3033                             int32_t                  width,
3034                             int32_t                  height)
3035 {
3036     uint32_t src;
3037     uint16_t    *dst_line, *dst, d;
3038     uint16_t w;
3039     int dst_stride;
3040     __m128i xmm_src, xmm_alpha;
3041     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3042
3043     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3044
3045     if (src == 0)
3046         return;
3047
3048     PIXMAN_IMAGE_GET_LINE (
3049         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3050
3051     xmm_src = expand_pixel_32_1x128 (src);
3052     xmm_alpha = expand_alpha_1x128 (xmm_src);
3053
3054     while (height--)
3055     {
3056         dst = dst_line;
3057
3058         /* call prefetch hint to optimize cache load*/
3059         cache_prefetch ((__m128i*)dst);
3060
3061         dst_line += dst_stride;
3062         w = width;
3063
3064         while (w && (unsigned long)dst & 15)
3065         {
3066             d = *dst;
3067
3068             *dst++ = pack_565_32_16 (
3069                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3070                                          _mm_movepi64_pi64 (xmm_alpha),
3071                                          expand565_16_1x64 (d))));
3072             w--;
3073         }
3074
3075         /* call prefetch hint to optimize cache load*/
3076         cache_prefetch ((__m128i*)dst);
3077
3078         while (w >= 8)
3079         {
3080             /* fill cache line with next memory */
3081             cache_prefetch_next ((__m128i*)dst);
3082
3083             xmm_dst = load_128_aligned ((__m128i*)dst);
3084
3085             unpack_565_128_4x128 (xmm_dst,
3086                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3087
3088             over_2x128 (&xmm_src, &xmm_src,
3089                         &xmm_alpha, &xmm_alpha,
3090                         &xmm_dst0, &xmm_dst1);
3091             over_2x128 (&xmm_src, &xmm_src,
3092                         &xmm_alpha, &xmm_alpha,
3093                         &xmm_dst2, &xmm_dst3);
3094
3095             xmm_dst = pack_565_4x128_128 (
3096                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3097
3098             save_128_aligned ((__m128i*)dst, xmm_dst);
3099
3100             dst += 8;
3101             w -= 8;
3102         }
3103
3104         while (w--)
3105         {
3106             d = *dst;
3107             *dst++ = pack_565_32_16 (
3108                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3109                                          _mm_movepi64_pi64 (xmm_alpha),
3110                                          expand565_16_1x64 (d))));
3111         }
3112     }
3113
3114     _mm_empty ();
3115 }
3116
3117 /* ------------------------------
3118  * composite_add_n_8888_8888_ca
3119  */
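/* Component-alpha ADD of a solid source through a 32-bit mask:
 * dest = clamp (mask * src + dest); four-pixel groups whose mask is entirely
 * zero are skipped.
 */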
3120 static void
3121 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3122                                    pixman_op_t              op,
3123                                    pixman_image_t *         src_image,
3124                                    pixman_image_t *         mask_image,
3125                                    pixman_image_t *         dst_image,
3126                                    int32_t                  src_x,
3127                                    int32_t                  src_y,
3128                                    int32_t                  mask_x,
3129                                    int32_t                  mask_y,
3130                                    int32_t                  dest_x,
3131                                    int32_t                  dest_y,
3132                                    int32_t                  width,
3133                                    int32_t                  height)
3134 {
3135     uint32_t src, srca;
3136     uint32_t    *dst_line, d;
3137     uint32_t    *mask_line, m;
3138     uint32_t pack_cmp;
3139     int dst_stride, mask_stride;
3140
3141     __m128i xmm_src, xmm_alpha;
3142     __m128i xmm_dst;
3143     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3144
3145     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3146
3147     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3148     srca = src >> 24;
3149     
3150     if (src == 0)
3151         return;
3152
3153     PIXMAN_IMAGE_GET_LINE (
3154         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3155     PIXMAN_IMAGE_GET_LINE (
3156         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3157
3158     xmm_src = _mm_unpacklo_epi8 (
3159         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3160     xmm_alpha = expand_alpha_1x128 (xmm_src);
3161     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3162     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3163
3164     while (height--)
3165     {
3166         int w = width;
3167         const uint32_t *pm = (uint32_t *)mask_line;
3168         uint32_t *pd = (uint32_t *)dst_line;
3169
3170         dst_line += dst_stride;
3171         mask_line += mask_stride;
3172
3173         /* call prefetch hint to optimize cache load*/
3174         cache_prefetch ((__m128i*)pd);
3175         cache_prefetch ((__m128i*)pm);
3176
3177         while (w && (unsigned long)pd & 15)
3178         {
3179             m = *pm++;
3180
3181             if (m)
3182             {
3183                 d = *pd;
3184                 
3185                 mmx_mask = unpack_32_1x64 (m);
3186                 mmx_dest = unpack_32_1x64 (d);
3187
3188                 *pd = pack_1x64_32 (
3189                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3190             }
3191
3192             pd++;
3193             w--;
3194         }
3195
3196         /* call prefetch hint to optimize cache load*/
3197         cache_prefetch ((__m128i*)pd);
3198         cache_prefetch ((__m128i*)pm);
3199
3200         while (w >= 4)
3201         {
3202             /* fill cache line with next memory */
3203             cache_prefetch_next ((__m128i*)pd);
3204             cache_prefetch_next ((__m128i*)pm);
3205
3206             xmm_mask = load_128_unaligned ((__m128i*)pm);
3207
3208             pack_cmp =
3209                 _mm_movemask_epi8 (
3210                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3211
3212             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3213             if (pack_cmp != 0xffff)
3214             {
3215                 xmm_dst = load_128_aligned ((__m128i*)pd);
3216
3217                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3218
3219                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3220                                     &xmm_mask_lo, &xmm_mask_hi,
3221                                     &xmm_mask_lo, &xmm_mask_hi);
3222                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3223                 
3224                 save_128_aligned (
3225                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3226             }
3227
3228             pd += 4;
3229             pm += 4;
3230             w -= 4;
3231         }
3232
3233         while (w)
3234         {
3235             m = *pm++;
3236
3237             if (m)
3238             {
3239                 d = *pd;
3240                 
3241                 mmx_mask = unpack_32_1x64 (m);
3242                 mmx_dest = unpack_32_1x64 (d);
3243
3244                 *pd = pack_1x64_32 (
3245                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3246             }
3247
3248             pd++;
3249             w--;
3250         }
3251     }
3252
3253     _mm_empty ();
3254 }
3255
3256 /* ---------------------------------------------------------------------------
3257  * composite_over_n_8888_8888_ca
3258  */
3259
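/* Component-alpha OVER of a solid source through a 32-bit mask onto a 32-bit
 * destination; _mm_movemask_epi8 is used to skip four-pixel groups whose mask
 * is entirely zero.
 */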
3260 static void
3261 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3262                                     pixman_op_t              op,
3263                                     pixman_image_t *         src_image,
3264                                     pixman_image_t *         mask_image,
3265                                     pixman_image_t *         dst_image,
3266                                     int32_t                  src_x,
3267                                     int32_t                  src_y,
3268                                     int32_t                  mask_x,
3269                                     int32_t                  mask_y,
3270                                     int32_t                  dest_x,
3271                                     int32_t                  dest_y,
3272                                     int32_t                  width,
3273                                     int32_t                  height)
3274 {
3275     uint32_t src;
3276     uint32_t    *dst_line, d;
3277     uint32_t    *mask_line, m;
3278     uint32_t pack_cmp;
3279     int dst_stride, mask_stride;
3280
3281     __m128i xmm_src, xmm_alpha;
3282     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3283     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3284
3285     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3286
3287     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3288
3289     if (src == 0)
3290         return;
3291
3292     PIXMAN_IMAGE_GET_LINE (
3293         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3294     PIXMAN_IMAGE_GET_LINE (
3295         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3296
3297     xmm_src = _mm_unpacklo_epi8 (
3298         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3299     xmm_alpha = expand_alpha_1x128 (xmm_src);
3300     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3301     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3302
3303     while (height--)
3304     {
3305         int w = width;
3306         const uint32_t *pm = (uint32_t *)mask_line;
3307         uint32_t *pd = (uint32_t *)dst_line;
3308
3309         dst_line += dst_stride;
3310         mask_line += mask_stride;
3311
3312         /* call prefetch hint to optimize cache load*/
3313         cache_prefetch ((__m128i*)pd);
3314         cache_prefetch ((__m128i*)pm);
3315
3316         while (w && (unsigned long)pd & 15)
3317         {
3318             m = *pm++;
3319
3320             if (m)
3321             {
3322                 d = *pd;
3323                 mmx_mask = unpack_32_1x64 (m);
3324                 mmx_dest = unpack_32_1x64 (d);
3325
3326                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3327                                                   &mmx_alpha,
3328                                                   &mmx_mask,
3329                                                   &mmx_dest));
3330             }
3331
3332             pd++;
3333             w--;
3334         }
3335
3336         /* call prefetch hint to optimize cache load*/
3337         cache_prefetch ((__m128i*)pd);
3338         cache_prefetch ((__m128i*)pm);
3339
3340         while (w >= 4)
3341         {
3342             /* fill cache line with next memory */
3343             cache_prefetch_next ((__m128i*)pd);
3344             cache_prefetch_next ((__m128i*)pm);
3345
3346             xmm_mask = load_128_unaligned ((__m128i*)pm);
3347
3348             pack_cmp =
3349                 _mm_movemask_epi8 (
3350                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3351
3352             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3353             if (pack_cmp != 0xffff)
3354             {
3355                 xmm_dst = load_128_aligned ((__m128i*)pd);
3356
3357                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3358                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3359
3360                 in_over_2x128 (&xmm_src, &xmm_src,
3361                                &xmm_alpha, &xmm_alpha,
3362                                &xmm_mask_lo, &xmm_mask_hi,
3363                                &xmm_dst_lo, &xmm_dst_hi);
3364
3365                 save_128_aligned (
3366                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3367             }
3368
3369             pd += 4;
3370             pm += 4;
3371             w -= 4;
3372         }
3373
3374         while (w)
3375         {
3376             m = *pm++;
3377
3378             if (m)
3379             {
3380                 d = *pd;
3381                 mmx_mask = unpack_32_1x64 (m);
3382                 mmx_dest = unpack_32_1x64 (d);
3383
3384                 *pd = pack_1x64_32 (
3385                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3386             }
3387
3388             pd++;
3389             w--;
3390         }
3391     }
3392
3393     _mm_empty ();
3394 }
3395
3396 /*---------------------------------------------------------------------
3397  * composite_over_8888_n_8888
3398  */
3399
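/* OVER of a 32-bit source onto a 32-bit destination, attenuated by the alpha
 * of a solid mask.
 */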
3400 static void
3401 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3402                                  pixman_op_t              op,
3403                                  pixman_image_t *         src_image,
3404                                  pixman_image_t *         mask_image,
3405                                  pixman_image_t *         dst_image,
3406                                  int32_t                  src_x,
3407                                  int32_t                  src_y,
3408                                  int32_t                  mask_x,
3409                                  int32_t                  mask_y,
3410                                  int32_t                  dest_x,
3411                                  int32_t                  dest_y,
3412                                  int32_t                  width,
3413                                  int32_t                  height)
3414 {
3415     uint32_t    *dst_line, *dst;
3416     uint32_t    *src_line, *src;
3417     uint32_t mask;
3418     uint16_t w;
3419     int dst_stride, src_stride;
3420
3421     __m128i xmm_mask;
3422     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3423     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3424     __m128i xmm_alpha_lo, xmm_alpha_hi;
3425
3426     PIXMAN_IMAGE_GET_LINE (
3427         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3428     PIXMAN_IMAGE_GET_LINE (
3429         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3430
3431     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3432
3433     xmm_mask = create_mask_16_128 (mask >> 24);
3434
3435     while (height--)
3436     {
3437         dst = dst_line;
3438         dst_line += dst_stride;
3439         src = src_line;
3440         src_line += src_stride;
3441         w = width;
3442
3443         /* call prefetch hint to optimize cache load */
3444         cache_prefetch ((__m128i*)dst);
3445         cache_prefetch ((__m128i*)src);
3446
3447         while (w && (unsigned long)dst & 15)
3448         {
3449             uint32_t s = *src++;
3450             uint32_t d = *dst;
3451
3452             __m64 ms    = unpack_32_1x64 (s);
3453             __m64 alpha = expand_alpha_1x64 (ms);
3454             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3455             __m64 dest  = unpack_32_1x64 (d);
3456
3457             *dst++ = pack_1x64_32 (
3458                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3459
3460             w--;
3461         }
3462
3463         /* call prefetch hint to optimize cache load */
3464         cache_prefetch ((__m128i*)dst);
3465         cache_prefetch ((__m128i*)src);
3466
3467         while (w >= 4)
3468         {
3469             /* fill cache line with next memory */
3470             cache_prefetch_next ((__m128i*)dst);
3471             cache_prefetch_next ((__m128i*)src);
3472
3473             xmm_src = load_128_unaligned ((__m128i*)src);
3474             xmm_dst = load_128_aligned ((__m128i*)dst);
3475
3476             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3477             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3478             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3479                                 &xmm_alpha_lo, &xmm_alpha_hi);
3480
3481             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3482                            &xmm_alpha_lo, &xmm_alpha_hi,
3483                            &xmm_mask, &xmm_mask,
3484                            &xmm_dst_lo, &xmm_dst_hi);
3485
3486             save_128_aligned (
3487                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3488
3489             dst += 4;
3490             src += 4;
3491             w -= 4;
3492         }
3493
3494         while (w)
3495         {
3496             uint32_t s = *src++;
3497             uint32_t d = *dst;
3498
3499             __m64 ms = unpack_32_1x64 (s);
3500             __m64 alpha = expand_alpha_1x64 (ms);
3501             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3502             __m64 dest  = unpack_32_1x64 (d);
3503
3504             *dst++ = pack_1x64_32 (
3505                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3506
3507             w--;
3508         }
3509     }
3510
3511     _mm_empty ();
3512 }
3513
3514 /* ---------------------------------------------------------------------
3515  * composite_over_x888_n_8888
3516  */
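/* Same as the 8888_n_8888 path above, but the source is x8r8g8b8: its alpha
 * byte is forced to 0xff (OR with 0xff000000 / mask_ff000000), so the
 * per-pixel source alpha is constant and xmm_alpha can simply be the 0x00ff
 * channel mask.
 */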
3517 static void
3518 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3519                                  pixman_op_t              op,
3520                                  pixman_image_t *         src_image,
3521                                  pixman_image_t *         mask_image,
3522                                  pixman_image_t *         dst_image,
3523                                  int32_t                  src_x,
3524                                  int32_t                  src_y,
3525                                  int32_t                  mask_x,
3526                                  int32_t                  mask_y,
3527                                  int32_t                  dest_x,
3528                                  int32_t                  dest_y,
3529                                  int32_t                  width,
3530                                  int32_t                  height)
3531 {
3532     uint32_t    *dst_line, *dst;
3533     uint32_t    *src_line, *src;
3534     uint32_t mask;
3535     int dst_stride, src_stride;
3536     uint16_t w;
3537
3538     __m128i xmm_mask, xmm_alpha;
3539     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3540     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3541
3542     PIXMAN_IMAGE_GET_LINE (
3543         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3544     PIXMAN_IMAGE_GET_LINE (
3545         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3546
3547     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3548
3549     xmm_mask = create_mask_16_128 (mask >> 24);
3550     xmm_alpha = mask_00ff;
3551
3552     while (height--)
3553     {
3554         dst = dst_line;
3555         dst_line += dst_stride;
3556         src = src_line;
3557         src_line += src_stride;
3558         w = width;
3559
3560         /* call prefetch hint to optimize cache load */
3561         cache_prefetch ((__m128i*)dst);
3562         cache_prefetch ((__m128i*)src);
3563
3564         while (w && (unsigned long)dst & 15)
3565         {
3566             uint32_t s = (*src++) | 0xff000000;
3567             uint32_t d = *dst;
3568
3569             __m64 src   = unpack_32_1x64 (s);
3570             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3571             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3572             __m64 dest  = unpack_32_1x64 (d);
3573
3574             *dst++ = pack_1x64_32 (
3575                 in_over_1x64 (&src, &alpha, &mask, &dest));
3576
3577             w--;
3578         }
3579
3580         /* call prefetch hint to optimize cache load */
3581         cache_prefetch ((__m128i*)dst);
3582         cache_prefetch ((__m128i*)src);
3583
3584         while (w >= 4)
3585         {
3586             /* fill cache line with next memory */
3587             cache_prefetch_next ((__m128i*)dst);
3588             cache_prefetch_next ((__m128i*)src);
3589
3590             xmm_src = _mm_or_si128 (
3591                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3592             xmm_dst = load_128_aligned ((__m128i*)dst);
3593
3594             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3595             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3596
3597             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3598                            &xmm_alpha, &xmm_alpha,
3599                            &xmm_mask, &xmm_mask,
3600                            &xmm_dst_lo, &xmm_dst_hi);
3601
3602             save_128_aligned (
3603                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3604
3605             dst += 4;
3606             src += 4;
3607             w -= 4;
3608
3609         }
3610
3611         while (w)
3612         {
3613             uint32_t s = (*src++) | 0xff000000;
3614             uint32_t d = *dst;
3615
3616             __m64 src  = unpack_32_1x64 (s);
3617             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3618             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3619             __m64 dest  = unpack_32_1x64 (d);
3620
3621             *dst++ = pack_1x64_32 (
3622                 in_over_1x64 (&src, &alpha, &mask, &dest));
3623
3624             w--;
3625         }
3626     }
3627
3628     _mm_empty ();
3629 }
3630
3631 /* --------------------------------------------------------------------
3632  * composite_over_8888_8888
3633  */
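/* Plain OVER of a8r8g8b8 onto a8r8g8b8 with no mask: each scanline is
 * handed to the shared core_combine_over_u_sse2() combiner with a NULL
 * mask pointer.
 */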
3634 static void
3635 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3636                                pixman_op_t              op,
3637                                pixman_image_t *         src_image,
3638                                pixman_image_t *         mask_image,
3639                                pixman_image_t *         dst_image,
3640                                int32_t                  src_x,
3641                                int32_t                  src_y,
3642                                int32_t                  mask_x,
3643                                int32_t                  mask_y,
3644                                int32_t                  dest_x,
3645                                int32_t                  dest_y,
3646                                int32_t                  width,
3647                                int32_t                  height)
3648 {
3649     int dst_stride, src_stride;
3650     uint32_t    *dst_line, *dst;
3651     uint32_t    *src_line, *src;
3652
3653     PIXMAN_IMAGE_GET_LINE (
3654         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3655     PIXMAN_IMAGE_GET_LINE (
3656         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3657
3658     dst = dst_line;
3659     src = src_line;
3660
3661     while (height--)
3662     {
3663         core_combine_over_u_sse2 (dst, src, NULL, width);
3664
3665         dst += dst_stride;
3666         src += src_stride;
3667     }
3668     _mm_empty ();
3669 }
3670
3671 /* ------------------------------------------------------------------
3672  * composite_over_8888_0565
3673  */
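/* OVER of an a8r8g8b8 source onto an r5g6b5 destination.  The scalar helper
 * below expands one 565 pixel to 8888, applies OVER and packs the result
 * back to 565.  The SSE2 loop works on eight pixels at a time: one aligned
 * 128-bit destination load covers eight 565 pixels (expanded into four
 * unpacked half-registers), while the 8888 source is fetched four pixels at
 * a time.
 */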
3674 static force_inline uint16_t
3675 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3676 {
3677     __m64 ms;
3678
3679     ms = unpack_32_1x64 (src);
3680     return pack_565_32_16 (
3681         pack_1x64_32 (
3682             over_1x64 (
3683                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3684 }
3685
3686 static void
3687 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3688                                pixman_op_t              op,
3689                                pixman_image_t *         src_image,
3690                                pixman_image_t *         mask_image,
3691                                pixman_image_t *         dst_image,
3692                                int32_t                  src_x,
3693                                int32_t                  src_y,
3694                                int32_t                  mask_x,
3695                                int32_t                  mask_y,
3696                                int32_t                  dest_x,
3697                                int32_t                  dest_y,
3698                                int32_t                  width,
3699                                int32_t                  height)
3700 {
3701     uint16_t    *dst_line, *dst, d;
3702     uint32_t    *src_line, *src, s;
3703     int dst_stride, src_stride;
3704     uint16_t w;
3705
3706     __m128i xmm_alpha_lo, xmm_alpha_hi;
3707     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3708     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3709
3710     PIXMAN_IMAGE_GET_LINE (
3711         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3712     PIXMAN_IMAGE_GET_LINE (
3713         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3714
3715 #if 0
3716     /* FIXME
3717      *
3718      * This code was copied from the MMX version along with its FIXME.
3719      * If it's a problem there, it's probably a problem here too.
3720      */
3721     assert (src_image->drawable == mask_image->drawable);
3722 #endif
3723
3724     while (height--)
3725     {
3726         dst = dst_line;
3727         src = src_line;
3728
3729         /* call prefetch hint to optimize cache load */
3730         cache_prefetch ((__m128i*)src);
3731         cache_prefetch ((__m128i*)dst);
3732
3733         dst_line += dst_stride;
3734         src_line += src_stride;
3735         w = width;
3736
3737         /* Align dst on a 16-byte boundary */
3738         while (w &&
3739                ((unsigned long)dst & 15))
3740         {
3741             s = *src++;
3742             d = *dst;
3743
3744             *dst++ = composite_over_8888_0565pixel (s, d);
3745             w--;
3746         }
3747
3748         /* call prefetch hint to optimize cache load */
3749         cache_prefetch ((__m128i*)src);
3750         cache_prefetch ((__m128i*)dst);
3751
3752         /* It's an 8-pixel loop */
3753         while (w >= 8)
3754         {
3755             /* fill cache line with next memory */
3756             cache_prefetch_next ((__m128i*)src);
3757             cache_prefetch_next ((__m128i*)dst);
3758
3759             /* I'm loading unaligned because I'm not sure
3760              * about the address alignment.
3761              */
3762             xmm_src = load_128_unaligned ((__m128i*) src);
3763             xmm_dst = load_128_aligned ((__m128i*) dst);
3764
3765             /* Unpacking */
3766             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3767             unpack_565_128_4x128 (xmm_dst,
3768                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3769             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3770                                 &xmm_alpha_lo, &xmm_alpha_hi);
3771
3772             /* I'm loading the next 4 pixels from memory early
3773              * to optimize the memory read.
3774              */
3775             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3776
3777             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3778                         &xmm_alpha_lo, &xmm_alpha_hi,
3779                         &xmm_dst0, &xmm_dst1);
3780
3781             /* Unpacking */
3782             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3783             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3784                                 &xmm_alpha_lo, &xmm_alpha_hi);
3785
3786             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3787                         &xmm_alpha_lo, &xmm_alpha_hi,
3788                         &xmm_dst2, &xmm_dst3);
3789
3790             save_128_aligned (
3791                 (__m128i*)dst, pack_565_4x128_128 (
3792                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3793
3794             w -= 8;
3795             dst += 8;
3796             src += 8;
3797         }
3798
3799         while (w--)
3800         {
3801             s = *src++;
3802             d = *dst;
3803
3804             *dst++ = composite_over_8888_0565pixel (s, d);
3805         }
3806     }
3807
3808     _mm_empty ();
3809 }
3810
3811 /* -----------------------------------------------------------------
3812  * composite_over_n_8_8888
3813  */
3814
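/* OVER of a solid source onto an a8r8g8b8 destination through an a8 mask.
 * The mask is read four bytes at a time as a uint32_t: a zero group is
 * skipped entirely, and a fully opaque group with an opaque source
 * (srca == 0xff && m == 0xffffffff) is stored as the precomputed solid
 * value xmm_def without reading the destination.
 */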
3815 static void
3816 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3817                               pixman_op_t              op,
3818                               pixman_image_t *         src_image,
3819                               pixman_image_t *         mask_image,
3820                               pixman_image_t *         dst_image,
3821                               int32_t                  src_x,
3822                               int32_t                  src_y,
3823                               int32_t                  mask_x,
3824                               int32_t                  mask_y,
3825                               int32_t                  dest_x,
3826                               int32_t                  dest_y,
3827                               int32_t                  width,
3828                               int32_t                  height)
3829 {
3830     uint32_t src, srca;
3831     uint32_t *dst_line, *dst;
3832     uint8_t *mask_line, *mask;
3833     int dst_stride, mask_stride;
3834     uint16_t w;
3835     uint32_t m, d;
3836
3837     __m128i xmm_src, xmm_alpha, xmm_def;
3838     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3839     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3840
3841     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3842
3843     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3844
3845     srca = src >> 24;
3846     if (src == 0)
3847         return;
3848
3849     PIXMAN_IMAGE_GET_LINE (
3850         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3851     PIXMAN_IMAGE_GET_LINE (
3852         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3853
3854     xmm_def = create_mask_2x32_128 (src, src);
3855     xmm_src = expand_pixel_32_1x128 (src);
3856     xmm_alpha = expand_alpha_1x128 (xmm_src);
3857     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3858     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3859
3860     while (height--)
3861     {
3862         dst = dst_line;
3863         dst_line += dst_stride;
3864         mask = mask_line;
3865         mask_line += mask_stride;
3866         w = width;
3867
3868         /* call prefetch hint to optimize cache load */
3869         cache_prefetch ((__m128i*)mask);
3870         cache_prefetch ((__m128i*)dst);
3871
3872         while (w && (unsigned long)dst & 15)
3873         {
3874             uint8_t m = *mask++;
3875
3876             if (m)
3877             {
3878                 d = *dst;
3879                 mmx_mask = expand_pixel_8_1x64 (m);
3880                 mmx_dest = unpack_32_1x64 (d);
3881
3882                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3883                                                    &mmx_alpha,
3884                                                    &mmx_mask,
3885                                                    &mmx_dest));
3886             }
3887
3888             w--;
3889             dst++;
3890         }
3891
3892         /* call prefetch hint to optimize cache load */
3893         cache_prefetch ((__m128i*)mask);
3894         cache_prefetch ((__m128i*)dst);
3895
3896         while (w >= 4)
3897         {
3898             /* fill cache line with next memory */
3899             cache_prefetch_next ((__m128i*)mask);
3900             cache_prefetch_next ((__m128i*)dst);
3901
3902             m = *((uint32_t*)mask);
3903
3904             if (srca == 0xff && m == 0xffffffff)
3905             {
3906                 save_128_aligned ((__m128i*)dst, xmm_def);
3907             }
3908             else if (m)
3909             {
3910                 xmm_dst = load_128_aligned ((__m128i*) dst);
3911                 xmm_mask = unpack_32_1x128 (m);
3912                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3913
3914                 /* Unpacking */
3915                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3916                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3917
3918                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3919                                         &xmm_mask_lo, &xmm_mask_hi);
3920
3921                 in_over_2x128 (&xmm_src, &xmm_src,
3922                                &xmm_alpha, &xmm_alpha,
3923                                &xmm_mask_lo, &xmm_mask_hi,
3924                                &xmm_dst_lo, &xmm_dst_hi);
3925
3926                 save_128_aligned (
3927                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3928             }
3929
3930             w -= 4;
3931             dst += 4;
3932             mask += 4;
3933         }
3934
3935         while (w)
3936         {
3937             uint8_t m = *mask++;
3938
3939             if (m)
3940             {
3941                 d = *dst;
3942                 mmx_mask = expand_pixel_8_1x64 (m);
3943                 mmx_dest = unpack_32_1x64 (d);
3944
3945                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3946                                                    &mmx_alpha,
3947                                                    &mmx_mask,
3948                                                    &mmx_dest));
3949             }
3950
3951             w--;
3952             dst++;
3953         }
3954     }
3955
3956     _mm_empty ();
3957 }
3958
3959 /* ----------------------------------------------------------------
3960  * pixman_fill_sse2
3961  */
3962
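/* Solid fill.  Only 16 and 32 bpp destinations are handled; for 16 bpp the
 * fill value must have identical halves (the 16-bit pattern replicated into
 * both halves of 'data'), otherwise FALSE is returned so a generic path can
 * take over.  The destination is first aligned with 16- and 32-bit stores,
 * then filled with the replicated 128-bit value in 128/64/32/16 byte
 * blocks, and any remainder is written with 32- and 16-bit stores.
 */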
3963 pixman_bool_t
3964 pixman_fill_sse2 (uint32_t *bits,
3965                   int       stride,
3966                   int       bpp,
3967                   int       x,
3968                   int       y,
3969                   int       width,
3970                   int       height,
3971                   uint32_t  data)
3972 {
3973     uint32_t byte_width;
3974     uint8_t         *byte_line;
3975
3976     __m128i xmm_def;
3977
3978     if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3979         return FALSE;
3980
3981     if (bpp != 16 && bpp != 32)
3982         return FALSE;
3983
3984     if (bpp == 16)
3985     {
3986         stride = stride * (int) sizeof (uint32_t) / 2;
3987         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3988         byte_width = 2 * width;
3989         stride *= 2;
3990     }
3991     else
3992     {
3993         stride = stride * (int) sizeof (uint32_t) / 4;
3994         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3995         byte_width = 4 * width;
3996         stride *= 4;
3997     }
3998
3999     cache_prefetch ((__m128i*)byte_line);
4000     xmm_def = create_mask_2x32_128 (data, data);
4001
4002     while (height--)
4003     {
4004         int w;
4005         uint8_t *d = byte_line;
4006         byte_line += stride;
4007         w = byte_width;
4008
4009
4010         cache_prefetch_next ((__m128i*)d);
4011
4012         while (w >= 2 && ((unsigned long)d & 3))
4013         {
4014             *(uint16_t *)d = data;
4015             w -= 2;
4016             d += 2;
4017         }
4018
4019         while (w >= 4 && ((unsigned long)d & 15))
4020         {
4021             *(uint32_t *)d = data;
4022
4023             w -= 4;
4024             d += 4;
4025         }
4026
4027         cache_prefetch_next ((__m128i*)d);
4028
4029         while (w >= 128)
4030         {
4031             cache_prefetch (((__m128i*)d) + 12);
4032
4033             save_128_aligned ((__m128i*)(d),     xmm_def);
4034             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4035             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4036             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4037             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
4038             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
4039             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
4040             save_128_aligned ((__m128i*)(d + 112), xmm_def);
4041
4042             d += 128;
4043             w -= 128;
4044         }
4045
4046         if (w >= 64)
4047         {
4048             cache_prefetch (((__m128i*)d) + 8);
4049
4050             save_128_aligned ((__m128i*)(d),     xmm_def);
4051             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4052             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4053             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4054
4055             d += 64;
4056             w -= 64;
4057         }
4058
4059         cache_prefetch_next ((__m128i*)d);
4060
4061         if (w >= 32)
4062         {
4063             save_128_aligned ((__m128i*)(d),     xmm_def);
4064             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4065
4066             d += 32;
4067             w -= 32;
4068         }
4069
4070         if (w >= 16)
4071         {
4072             save_128_aligned ((__m128i*)(d),     xmm_def);
4073
4074             d += 16;
4075             w -= 16;
4076         }
4077
4078         cache_prefetch_next ((__m128i*)d);
4079
4080         while (w >= 4)
4081         {
4082             *(uint32_t *)d = data;
4083
4084             w -= 4;
4085             d += 4;
4086         }
4087
4088         if (w >= 2)
4089         {
4090             *(uint16_t *)d = data;
4091             w -= 2;
4092             d += 2;
4093         }
4094     }
4095
4096     _mm_empty ();
4097     return TRUE;
4098 }
4099
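#if 0
/* Illustrative usage sketch only (never compiled): fill a 64x64
 * sub-rectangle of a hypothetical 128x128 a8r8g8b8 scratch buffer with
 * opaque red.  The stride argument is in uint32_t units, matching the
 * bits.rowstride convention used by the callers above; the buffer and the
 * wrapper function exist purely for illustration.
 */
static void
example_fill_usage (void)
{
    static uint32_t scratch[128 * 128];

    pixman_fill_sse2 (scratch, 128, 32, 16, 16, 64, 64, 0xffff0000);
}
#endif
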
4100 static void
4101 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4102                              pixman_op_t              op,
4103                              pixman_image_t *         src_image,
4104                              pixman_image_t *         mask_image,
4105                              pixman_image_t *         dst_image,
4106                              int32_t                  src_x,
4107                              int32_t                  src_y,
4108                              int32_t                  mask_x,
4109                              int32_t                  mask_y,
4110                              int32_t                  dest_x,
4111                              int32_t                  dest_y,
4112                              int32_t                  width,
4113                              int32_t                  height)
4114 {
4115     uint32_t src, srca;
4116     uint32_t    *dst_line, *dst;
4117     uint8_t     *mask_line, *mask;
4118     int dst_stride, mask_stride;
4119     uint16_t w;
4120     uint32_t m;
4121
4122     __m128i xmm_src, xmm_def;
4123     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4124
4125     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4126
4127     srca = src >> 24;
4128     if (src == 0)
4129     {
4130         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4131                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
4132                           dest_x, dest_y, width, height, 0);
4133         return;
4134     }
4135
4136     PIXMAN_IMAGE_GET_LINE (
4137         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4138     PIXMAN_IMAGE_GET_LINE (
4139         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4140
4141     xmm_def = create_mask_2x32_128 (src, src);
4142     xmm_src = expand_pixel_32_1x128 (src);
4143
4144     while (height--)
4145     {
4146         dst = dst_line;
4147         dst_line += dst_stride;
4148         mask = mask_line;
4149         mask_line += mask_stride;
4150         w = width;
4151
4152         /* call prefetch hint to optimize cache load */
4153         cache_prefetch ((__m128i*)mask);
4154         cache_prefetch ((__m128i*)dst);
4155
4156         while (w && (unsigned long)dst & 15)
4157         {
4158             uint8_t m = *mask++;
4159
4160             if (m)
4161             {
4162                 *dst = pack_1x64_32 (
4163                     pix_multiply_1x64 (
4164                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4165             }
4166             else
4167             {
4168                 *dst = 0;
4169             }
4170
4171             w--;
4172             dst++;
4173         }
4174
4175         /* call prefetch hint to optimize cache load */
4176         cache_prefetch ((__m128i*)mask);
4177         cache_prefetch ((__m128i*)dst);
4178
4179         while (w >= 4)
4180         {
4181             /* fill cache line with next memory */
4182             cache_prefetch_next ((__m128i*)mask);
4183             cache_prefetch_next ((__m128i*)dst);
4184
4185             m = *((uint32_t*)mask);
4186
4187             if (srca == 0xff && m == 0xffffffff)
4188             {
4189                 save_128_aligned ((__m128i*)dst, xmm_def);
4190             }
4191             else if (m)
4192             {
4193                 xmm_mask = unpack_32_1x128 (m);
4194                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4195
4196                 /* Unpacking */
4197                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4198
4199                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4200                                         &xmm_mask_lo, &xmm_mask_hi);
4201
4202                 pix_multiply_2x128 (&xmm_src, &xmm_src,
4203                                     &xmm_mask_lo, &xmm_mask_hi,
4204                                     &xmm_mask_lo, &xmm_mask_hi);
4205
4206                 save_128_aligned (
4207                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4208             }
4209             else
4210             {
4211                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4212             }
4213
4214             w -= 4;
4215             dst += 4;
4216             mask += 4;
4217         }
4218
4219         while (w)
4220         {
4221             uint8_t m = *mask++;
4222
4223             if (m)
4224             {
4225                 *dst = pack_1x64_32 (
4226                     pix_multiply_1x64 (
4227                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4228             }
4229             else
4230             {
4231                 *dst = 0;
4232             }
4233
4234             w--;
4235             dst++;
4236         }
4237     }
4238
4239     _mm_empty ();
4240 }
4241
4242 /*-----------------------------------------------------------------------
4243  * composite_over_n_8_0565
4244  */
4245
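/* OVER of a solid source onto an r5g6b5 destination through an a8 mask.
 * Eight pixels per iteration: the destination is loaded as one aligned
 * 128-bit chunk and expanded from 565 into four unpacked half-registers,
 * the mask is consumed four bytes at a time (two uint32_t reads, each
 * skipped when zero), and the result is repacked with pack_565_4x128_128.
 */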
4246 static void
4247 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4248                               pixman_op_t              op,
4249                               pixman_image_t *         src_image,
4250                               pixman_image_t *         mask_image,
4251                               pixman_image_t *         dst_image,
4252                               int32_t                  src_x,
4253                               int32_t                  src_y,
4254                               int32_t                  mask_x,
4255                               int32_t                  mask_y,
4256                               int32_t                  dest_x,
4257                               int32_t                  dest_y,
4258                               int32_t                  width,
4259                               int32_t                  height)
4260 {
4261     uint32_t src, srca;
4262     uint16_t    *dst_line, *dst, d;
4263     uint8_t     *mask_line, *mask;
4264     int dst_stride, mask_stride;
4265     uint16_t w;
4266     uint32_t m;
4267     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4268
4269     __m128i xmm_src, xmm_alpha;
4270     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4271     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4272
4273     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4274
4275     srca = src >> 24;
4276     if (src == 0)
4277         return;
4278
4279     PIXMAN_IMAGE_GET_LINE (
4280         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4281     PIXMAN_IMAGE_GET_LINE (
4282         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4283
4284     xmm_src = expand_pixel_32_1x128 (src);
4285     xmm_alpha = expand_alpha_1x128 (xmm_src);
4286     mmx_src = _mm_movepi64_pi64 (xmm_src);
4287     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4288
4289     while (height--)
4290     {
4291         dst = dst_line;
4292         dst_line += dst_stride;
4293         mask = mask_line;
4294         mask_line += mask_stride;
4295         w = width;
4296
4297         /* call prefetch hint to optimize cache load */
4298         cache_prefetch ((__m128i*)mask);
4299         cache_prefetch ((__m128i*)dst);
4300
4301         while (w && (unsigned long)dst & 15)
4302         {
4303             m = *mask++;
4304
4305             if (m)
4306             {
4307                 d = *dst;
4308                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4309                 mmx_dest = expand565_16_1x64 (d);
4310
4311                 *dst = pack_565_32_16 (
4312                     pack_1x64_32 (
4313                         in_over_1x64 (
4314                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4315             }
4316
4317             w--;
4318             dst++;
4319         }
4320
4321         /* call prefetch hint to optimize cache load */
4322         cache_prefetch ((__m128i*)mask);
4323         cache_prefetch ((__m128i*)dst);
4324
4325         while (w >= 8)
4326         {
4327             /* fill cache line with next memory */
4328             cache_prefetch_next ((__m128i*)mask);
4329             cache_prefetch_next ((__m128i*)dst);
4330
4331             xmm_dst = load_128_aligned ((__m128i*) dst);
4332             unpack_565_128_4x128 (xmm_dst,
4333                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4334
4335             m = *((uint32_t*)mask);
4336             mask += 4;
4337
4338             if (m)
4339             {
4340                 xmm_mask = unpack_32_1x128 (m);
4341                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4342
4343                 /* Unpacking */
4344                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4345
4346                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4347                                         &xmm_mask_lo, &xmm_mask_hi);
4348
4349                 in_over_2x128 (&xmm_src, &xmm_src,
4350                                &xmm_alpha, &xmm_alpha,
4351                                &xmm_mask_lo, &xmm_mask_hi,
4352                                &xmm_dst0, &xmm_dst1);
4353             }
4354
4355             m = *((uint32_t*)mask);
4356             mask += 4;
4357
4358             if (m)
4359             {
4360                 xmm_mask = unpack_32_1x128 (m);
4361                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4362
4363                 /* Unpacking */
4364                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4365
4366                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4367                                         &xmm_mask_lo, &xmm_mask_hi);
4368                 in_over_2x128 (&xmm_src, &xmm_src,
4369                                &xmm_alpha, &xmm_alpha,
4370                                &xmm_mask_lo, &xmm_mask_hi,
4371                                &xmm_dst2, &xmm_dst3);
4372             }
4373
4374             save_128_aligned (
4375                 (__m128i*)dst, pack_565_4x128_128 (
4376                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4377
4378             w -= 8;
4379             dst += 8;
4380         }
4381
4382         while (w)
4383         {
4384             m = *mask++;
4385
4386             if (m)
4387             {
4388                 d = *dst;
4389                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4390                 mmx_dest = expand565_16_1x64 (d);
4391
4392                 *dst = pack_565_32_16 (
4393                     pack_1x64_32 (
4394                         in_over_1x64 (
4395                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4396             }
4397
4398             w--;
4399             dst++;
4400         }
4401     }
4402
4403     _mm_empty ();
4404 }
4405
4406 /* -----------------------------------------------------------------------
4407  * composite_over_pixbuf_0565
4408  */
4409
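/* OVER of a "pixbuf" source onto an r5g6b5 destination.  Judging from the
 * helper names (over_rev_non_pre_*, invert_colors_*) and the #if 0 assert
 * below, the source is non-premultiplied with swapped red/blue channels and
 * doubles as its own mask.  is_opaque()/is_zero() test each group of four
 * source pixels: fully opaque groups only need the channel swap
 * (invert_colors_2x128), fully transparent groups leave the destination
 * untouched, and mixed groups go through over_rev_non_pre_2x128.
 */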
4410 static void
4411 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4412                                  pixman_op_t              op,
4413                                  pixman_image_t *         src_image,
4414                                  pixman_image_t *         mask_image,
4415                                  pixman_image_t *         dst_image,
4416                                  int32_t                  src_x,
4417                                  int32_t                  src_y,
4418                                  int32_t                  mask_x,
4419                                  int32_t                  mask_y,
4420                                  int32_t                  dest_x,
4421                                  int32_t                  dest_y,
4422                                  int32_t                  width,
4423                                  int32_t                  height)
4424 {
4425     uint16_t    *dst_line, *dst, d;
4426     uint32_t    *src_line, *src, s;
4427     int dst_stride, src_stride;
4428     uint16_t w;
4429     uint32_t opaque, zero;
4430
4431     __m64 ms;
4432     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4433     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4434
4435     PIXMAN_IMAGE_GET_LINE (
4436         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4437     PIXMAN_IMAGE_GET_LINE (
4438         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4439
4440 #if 0
4441     /* FIXME
4442      *
4443      * This code was copied from the MMX version along with its FIXME.
4444      * If it's a problem there, it's probably a problem here too.
4445      */
4446     assert (src_image->drawable == mask_image->drawable);
4447 #endif
4448
4449     while (height--)
4450     {
4451         dst = dst_line;
4452         dst_line += dst_stride;
4453         src = src_line;
4454         src_line += src_stride;
4455         w = width;
4456
4457         /* call prefetch hint to optimize cache load */
4458         cache_prefetch ((__m128i*)src);
4459         cache_prefetch ((__m128i*)dst);
4460
4461         while (w && (unsigned long)dst & 15)
4462         {
4463             s = *src++;
4464             d = *dst;
4465
4466             ms = unpack_32_1x64 (s);
4467
4468             *dst++ = pack_565_32_16 (
4469                 pack_1x64_32 (
4470                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4471             w--;
4472         }
4473
4474         /* call prefetch hint to optimize cache load */
4475         cache_prefetch ((__m128i*)src);
4476         cache_prefetch ((__m128i*)dst);
4477
4478         while (w >= 8)
4479         {
4480             /* fill cache line with next memory */
4481             cache_prefetch_next ((__m128i*)src);
4482             cache_prefetch_next ((__m128i*)dst);
4483
4484             /* First round */
4485             xmm_src = load_128_unaligned ((__m128i*)src);
4486             xmm_dst = load_128_aligned  ((__m128i*)dst);
4487
4488             opaque = is_opaque (xmm_src);
4489             zero = is_zero (xmm_src);
4490
4491             unpack_565_128_4x128 (xmm_dst,
4492                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4493             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4494
4495             /* preload next round */
4496             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4497
4498             if (opaque)
4499             {
4500                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4501                                      &xmm_dst0, &xmm_dst1);
4502             }
4503             else if (!zero)
4504             {
4505                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4506                                         &xmm_dst0, &xmm_dst1);
4507             }
4508
4509             /* Second round */
4510             opaque = is_opaque (xmm_src);
4511             zero = is_zero (xmm_src);
4512
4513             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4514
4515             if (opaque)
4516             {
4517                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4518                                      &xmm_dst2, &xmm_dst3);
4519             }
4520             else if (!zero)
4521             {
4522                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4523                                         &xmm_dst2, &xmm_dst3);
4524             }
4525
4526             save_128_aligned (
4527                 (__m128i*)dst, pack_565_4x128_128 (
4528                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4529
4530             w -= 8;
4531             src += 8;
4532             dst += 8;
4533         }
4534
4535         while (w)
4536         {
4537             s = *src++;
4538             d = *dst;
4539
4540             ms = unpack_32_1x64 (s);
4541
4542             *dst++ = pack_565_32_16 (
4543                 pack_1x64_32 (
4544                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4545             w--;
4546         }
4547     }
4548
4549     _mm_empty ();
4550 }
4551
4552 /* -------------------------------------------------------------------------
4553  * composite_over_pixbuf_8888
4554  */
4555
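/* The a8r8g8b8-destination variant of the pixbuf path above, four pixels
 * per iteration: fully opaque groups are channel-swapped and stored without
 * reading the destination, fully transparent groups are skipped, and mixed
 * groups are blended with over_rev_non_pre_2x128.
 */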
4556 static void
4557 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4558                                  pixman_op_t              op,
4559                                  pixman_image_t *         src_image,
4560                                  pixman_image_t *         mask_image,
4561                                  pixman_image_t *         dst_image,
4562                                  int32_t                  src_x,
4563                                  int32_t                  src_y,
4564                                  int32_t                  mask_x,
4565                                  int32_t                  mask_y,
4566                                  int32_t                  dest_x,
4567                                  int32_t                  dest_y,
4568                                  int32_t                  width,
4569                                  int32_t                  height)
4570 {
4571     uint32_t    *dst_line, *dst, d;
4572     uint32_t    *src_line, *src, s;
4573     int dst_stride, src_stride;
4574     uint16_t w;
4575     uint32_t opaque, zero;
4576
4577     __m128i xmm_src_lo, xmm_src_hi;
4578     __m128i xmm_dst_lo, xmm_dst_hi;
4579
4580     PIXMAN_IMAGE_GET_LINE (
4581         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4582     PIXMAN_IMAGE_GET_LINE (
4583         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4584
4585 #if 0
4586     /* FIXME
4587      *
4588      * This code was copied from the MMX version along with its FIXME.
4589      * If it's a problem there, it's probably a problem here too.
4590      */
4591     assert (src_image->drawable == mask_image->drawable);
4592 #endif
4593
4594     while (height--)
4595     {
4596         dst = dst_line;
4597         dst_line += dst_stride;
4598         src = src_line;
4599         src_line += src_stride;
4600         w = width;
4601
4602         /* call prefetch hint to optimize cache load */
4603         cache_prefetch ((__m128i*)src);
4604         cache_prefetch ((__m128i*)dst);
4605
4606         while (w && (unsigned long)dst & 15)
4607         {
4608             s = *src++;
4609             d = *dst;
4610
4611             *dst++ = pack_1x64_32 (
4612                 over_rev_non_pre_1x64 (
4613                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4614
4615             w--;
4616         }
4617
4618         /* call prefetch hint to optimize cache load */
4619         cache_prefetch ((__m128i*)src);
4620         cache_prefetch ((__m128i*)dst);
4621
4622         while (w >= 4)
4623         {
4624             /* fill cache line with next memory */
4625             cache_prefetch_next ((__m128i*)src);
4626             cache_prefetch_next ((__m128i*)dst);
4627
4628             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4629
4630             opaque = is_opaque (xmm_src_hi);
4631             zero = is_zero (xmm_src_hi);
4632
4633             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4634
4635             if (opaque)
4636             {
4637                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4638                                      &xmm_dst_lo, &xmm_dst_hi);
4639
4640                 save_128_aligned (
4641                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4642             }
4643             else if (!zero)
4644             {
4645                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4646
4647                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4648
4649                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4650                                         &xmm_dst_lo, &xmm_dst_hi);
4651
4652                 save_128_aligned (
4653                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4654             }
4655
4656             w -= 4;
4657             dst += 4;
4658             src += 4;
4659         }
4660
4661         while (w)
4662         {
4663             s = *src++;
4664             d = *dst;
4665
4666             *dst++ = pack_1x64_32 (
4667                 over_rev_non_pre_1x64 (
4668                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4669
4670             w--;
4671         }
4672     }
4673
4674     _mm_empty ();
4675 }
4676
4677 /* -------------------------------------------------------------------------------------------------
4678  * composite_over_n_8888_0565_ca
4679  */
4680
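/* Component-alpha OVER of a solid source onto an r5g6b5 destination: the
 * a8r8g8b8 mask supplies a separate coverage value per color channel, so
 * in_over_2x128 multiplies the solid source by the mask channel-wise.
 * pack_cmp (movemask of a compare against zero) is 0xffff only when all
 * four mask pixels are zero, which lets such groups skip the blend.  Eight
 * destination pixels are processed per iteration.
 */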
4681 static void
4682 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4683                                     pixman_op_t              op,
4684                                     pixman_image_t *         src_image,
4685                                     pixman_image_t *         mask_image,
4686                                     pixman_image_t *         dst_image,
4687                                     int32_t                  src_x,
4688                                     int32_t                  src_y,
4689                                     int32_t                  mask_x,
4690                                     int32_t                  mask_y,
4691                                     int32_t                  dest_x,
4692                                     int32_t                  dest_y,
4693                                     int32_t                  width,
4694                                     int32_t                  height)
4695 {
4696     uint32_t src;
4697     uint16_t    *dst_line, *dst, d;
4698     uint32_t    *mask_line, *mask, m;
4699     int dst_stride, mask_stride;
4700     int w;
4701     uint32_t pack_cmp;
4702
4703     __m128i xmm_src, xmm_alpha;
4704     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4705     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4706
4707     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4708
4709     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4710
4711     if (src == 0)
4712         return;
4713
4714     PIXMAN_IMAGE_GET_LINE (
4715         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4716     PIXMAN_IMAGE_GET_LINE (
4717         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4718
4719     xmm_src = expand_pixel_32_1x128 (src);
4720     xmm_alpha = expand_alpha_1x128 (xmm_src);
4721     mmx_src = _mm_movepi64_pi64 (xmm_src);
4722     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4723
4724     while (height--)
4725     {
4726         w = width;
4727         mask = mask_line;
4728         dst = dst_line;
4729         mask_line += mask_stride;
4730         dst_line += dst_stride;
4731
4732         /* call prefetch hint to optimize cache load */
4733         cache_prefetch ((__m128i*)mask);
4734         cache_prefetch ((__m128i*)dst);
4735
4736         while (w && ((unsigned long)dst & 15))
4737         {
4738             m = *(uint32_t *) mask;
4739
4740             if (m)
4741             {
4742                 d = *dst;
4743                 mmx_mask = unpack_32_1x64 (m);
4744                 mmx_dest = expand565_16_1x64 (d);
4745
4746                 *dst = pack_565_32_16 (
4747                     pack_1x64_32 (
4748                         in_over_1x64 (
4749                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4750             }
4751
4752             w--;
4753             dst++;
4754             mask++;
4755         }
4756
4757         /* call prefetch hint to optimize cache load */
4758         cache_prefetch ((__m128i*)mask);
4759         cache_prefetch ((__m128i*)dst);
4760
4761         while (w >= 8)
4762         {
4763             /* fill cache line with next memory */
4764             cache_prefetch_next ((__m128i*)mask);
4765             cache_prefetch_next ((__m128i*)dst);
4766
4767             /* First round */
4768             xmm_mask = load_128_unaligned ((__m128i*)mask);
4769             xmm_dst = load_128_aligned ((__m128i*)dst);
4770
4771             pack_cmp = _mm_movemask_epi8 (
4772                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4773
4774             unpack_565_128_4x128 (xmm_dst,
4775                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4776             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4777
4778             /* preload next round */
4779             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4780
4781             /* first round: blend only if some mask pixel is non-zero */
4782             if (pack_cmp != 0xffff)
4783             {
4784                 in_over_2x128 (&xmm_src, &xmm_src,
4785                                &xmm_alpha, &xmm_alpha,
4786                                &xmm_mask_lo, &xmm_mask_hi,
4787                                &xmm_dst0, &xmm_dst1);
4788             }
4789
4790             /* Second round */
4791             pack_cmp = _mm_movemask_epi8 (
4792                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4793
4794             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4795
4796             if (pack_cmp != 0xffff)
4797             {
4798                 in_over_2x128 (&xmm_src, &xmm_src,
4799                                &xmm_alpha, &xmm_alpha,
4800                                &xmm_mask_lo, &xmm_mask_hi,
4801                                &xmm_dst2, &xmm_dst3);
4802             }
4803
4804             save_128_aligned (
4805                 (__m128i*)dst, pack_565_4x128_128 (
4806                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4807
4808             w -= 8;
4809             dst += 8;
4810             mask += 8;
4811         }
4812
4813         while (w)
4814         {
4815             m = *(uint32_t *) mask;
4816
4817             if (m)
4818             {
4819                 d = *dst;
4820                 mmx_mask = unpack_32_1x64 (m);
4821                 mmx_dest = expand565_16_1x64 (d);
4822
4823                 *dst = pack_565_32_16 (
4824                     pack_1x64_32 (
4825                         in_over_1x64 (
4826                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4827             }
4828
4829             w--;
4830             dst++;
4831             mask++;
4832         }
4833     }
4834
4835     _mm_empty ();
4836 }
4837
4838 /* -----------------------------------------------------------------------
4839  * composite_in_n_8_8
4840  */
4841
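/* IN of a solid source onto an a8 destination through an a8 mask: each
 * destination byte becomes d * (srca * m / 255) / 255, where srca is the
 * solid source alpha, computed with two pix_multiply passes over sixteen
 * bytes per SSE2 iteration.  For example (rounded multiplies), srca = 0x80,
 * m = 0xff, d = 0x40 gives 0x80 * 0xff / 255 = 0x80, then
 * 0x80 * 0x40 / 255 = 0x20.
 */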
4842 static void
4843 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4844                          pixman_op_t              op,
4845                          pixman_image_t *         src_image,
4846                          pixman_image_t *         mask_image,
4847                          pixman_image_t *         dst_image,
4848                          int32_t                  src_x,
4849                          int32_t                  src_y,
4850                          int32_t                  mask_x,
4851                          int32_t                  mask_y,
4852                          int32_t                  dest_x,
4853                          int32_t                  dest_y,
4854                          int32_t                  width,
4855                          int32_t                  height)
4856 {
4857     uint8_t     *dst_line, *dst;
4858     uint8_t     *mask_line, *mask;
4859     int dst_stride, mask_stride;
4860     uint16_t w, d, m;
4861     uint32_t src;
4862     uint8_t sa;
4863
4864     __m128i xmm_alpha;
4865     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4866     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4867
4868     PIXMAN_IMAGE_GET_LINE (
4869         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4870     PIXMAN_IMAGE_GET_LINE (
4871         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4872
4873     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4874
4875     sa = src >> 24;
4876
4877     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4878
4879     while (height--)
4880     {
4881         dst = dst_line;
4882         dst_line += dst_stride;
4883         mask = mask_line;
4884         mask_line += mask_stride;
4885         w = width;
4886
4887         /* call prefetch hint to optimize cache load */
4888         cache_prefetch ((__m128i*)mask);
4889         cache_prefetch ((__m128i*)dst);
4890
4891         while (w && ((unsigned long)dst & 15))
4892         {
4893             m = (uint32_t) *mask++;
4894             d = (uint32_t) *dst;
4895
4896             *dst++ = (uint8_t) pack_1x64_32 (
4897                 pix_multiply_1x64 (
4898                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4899                                        unpack_32_1x64 (m)),
4900                     unpack_32_1x64 (d)));
4901             w--;
4902         }
4903
4904         /* call prefetch hint to optimize cache load */
4905         cache_prefetch ((__m128i*)mask);
4906         cache_prefetch ((__m128i*)dst);
4907
4908         while (w >= 16)
4909         {
4910             /* fill cache line with next memory */
4911             cache_prefetch_next ((__m128i*)mask);
4912             cache_prefetch_next ((__m128i*)dst);
4913
4914             xmm_mask = load_128_unaligned ((__m128i*)mask);
4915             xmm_dst = load_128_aligned ((__m128i*)dst);
4916
4917             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4918             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4919
4920             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4921                                 &xmm_mask_lo, &xmm_mask_hi,
4922                                 &xmm_mask_lo, &xmm_mask_hi);
4923
4924             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4925                                 &xmm_dst_lo, &xmm_dst_hi,
4926                                 &xmm_dst_lo, &xmm_dst_hi);
4927
4928             save_128_aligned (
4929                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4930
4931             mask += 16;
4932             dst += 16;
4933             w -= 16;
4934         }
4935
4936         while (w)
4937         {
4938             m = (uint32_t) *mask++;
4939             d = (uint32_t) *dst;
4940
4941             *dst++ = (uint8_t) pack_1x64_32 (
4942                 pix_multiply_1x64 (
4943                     pix_multiply_1x64 (
4944                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4945                     unpack_32_1x64 (d)));
4946             w--;
4947         }
4948     }
4949
4950     _mm_empty ();
4951 }
4952
4953 /* ---------------------------------------------------------------------------
4954  * composite_in_8_8
4955  */
4956
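/* Two-operand IN of an a8 source onto an a8 destination: each byte becomes
 * s * d / 255, sixteen bytes per SSE2 iteration with a scalar head and
 * tail.
 */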
4957 static void
4958 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4959                        pixman_op_t              op,
4960                        pixman_image_t *         src_image,
4961                        pixman_image_t *         mask_image,
4962                        pixman_image_t *         dst_image,
4963                        int32_t                  src_x,
4964                        int32_t                  src_y,
4965                        int32_t                  mask_x,
4966                        int32_t                  mask_y,
4967                        int32_t                  dest_x,
4968                        int32_t                  dest_y,
4969                        int32_t                  width,
4970                        int32_t                  height)
4971 {
4972     uint8_t     *dst_line, *dst;
4973     uint8_t     *src_line, *src;
4974     int src_stride, dst_stride;
4975     uint16_t w;
4976     uint32_t s, d;
4977
4978     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4979     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4980
4981     PIXMAN_IMAGE_GET_LINE (
4982         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4983     PIXMAN_IMAGE_GET_LINE (
4984         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4985
4986     while (height--)
4987     {
4988         dst = dst_line;
4989         dst_line += dst_stride;
4990         src = src_line;
4991         src_line += src_stride;
4992         w = width;
4993
4994         /* call prefetch hint to optimize cache load */
4995         cache_prefetch ((__m128i*)src);
4996         cache_prefetch ((__m128i*)dst);
4997
4998         while (w && ((unsigned long)dst & 15))
4999         {
5000             s = (uint32_t) *src++;
5001             d = (uint32_t) *dst;
5002
5003             *dst++ = (uint8_t) pack_1x64_32 (
5004                 pix_multiply_1x64 (
5005                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
5006             w--;
5007         }
5008
5009         /* call prefetch hint to optimize cache load */
5010         cache_prefetch ((__m128i*)src);
5011         cache_prefetch ((__m128i*)dst);
5012
5013         while (w >= 16)
5014         {
5015             /* prefetch the next cache lines of src and dst */
5016             cache_prefetch_next ((__m128i*)src);
5017             cache_prefetch_next ((__m128i*)dst);
5018
5019             xmm_src = load_128_unaligned ((__m128i*)src);
5020             xmm_dst = load_128_aligned ((__m128i*)dst);
5021
5022             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5023             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5024
5025             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5026                                 &xmm_dst_lo, &xmm_dst_hi,
5027                                 &xmm_dst_lo, &xmm_dst_hi);
5028
5029             save_128_aligned (
5030                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5031
5032             src += 16;
5033             dst += 16;
5034             w -= 16;
5035         }
5036
5037         while (w)
5038         {
5039             s = (uint32_t) *src++;
5040             d = (uint32_t) *dst;
5041
5042             *dst++ = (uint8_t) pack_1x64_32 (
5043                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5044             w--;
5045         }
5046     }
5047
5048     _mm_empty ();
5049 }
5050
5051 /* -------------------------------------------------------------------------
5052  * composite_add_n_8_8
5053  */
5054
5055 static void
5056 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5057                           pixman_op_t              op,
5058                           pixman_image_t *         src_image,
5059                           pixman_image_t *         mask_image,
5060                           pixman_image_t *         dst_image,
5061                           int32_t                  src_x,
5062                           int32_t                  src_y,
5063                           int32_t                  mask_x,
5064                           int32_t                  mask_y,
5065                           int32_t                  dest_x,
5066                           int32_t                  dest_y,
5067                           int32_t                  width,
5068                           int32_t                  height)
5069 {
5070     uint8_t     *dst_line, *dst;
5071     uint8_t     *mask_line, *mask;
5072     int dst_stride, mask_stride;
5073     uint16_t w;
5074     uint32_t src;
5075     uint8_t sa;
5076     uint32_t m, d;
5077
5078     __m128i xmm_alpha;
5079     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5080     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5081
5082     PIXMAN_IMAGE_GET_LINE (
5083         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5084     PIXMAN_IMAGE_GET_LINE (
5085         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5086
5087     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5088
5089     sa = src >> 24;
5090
5091     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5092
5093     while (height--)
5094     {
5095         dst = dst_line;
5096         dst_line += dst_stride;
5097         mask = mask_line;
5098         mask_line += mask_stride;
5099         w = width;
5100
5101         /* call prefetch hint to optimize cache load */
5102         cache_prefetch ((__m128i*)mask);
5103         cache_prefetch ((__m128i*)dst);
5104
5105         while (w && ((unsigned long)dst & 15))
5106         {
5107             m = (uint32_t) *mask++;
5108             d = (uint32_t) *dst;
5109
5110             *dst++ = (uint8_t) pack_1x64_32 (
5111                 _mm_adds_pu16 (
5112                     pix_multiply_1x64 (
5113                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5114                     unpack_32_1x64 (d)));
5115             w--;
5116         }
5117
5118         /* call prefetch hint to optimize cache load */
5119         cache_prefetch ((__m128i*)mask);
5120         cache_prefetch ((__m128i*)dst);
5121
5122         while (w >= 16)
5123         {
5124             /* prefetch the next cache lines of mask and dst */
5125             cache_prefetch_next ((__m128i*)mask);
5126             cache_prefetch_next ((__m128i*)dst);
5127
5128             xmm_mask = load_128_unaligned ((__m128i*)mask);
5129             xmm_dst = load_128_aligned ((__m128i*)dst);
5130
5131             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5132             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5133
5134             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5135                                 &xmm_mask_lo, &xmm_mask_hi,
5136                                 &xmm_mask_lo, &xmm_mask_hi);
5137
5138             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5139             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5140
5141             save_128_aligned (
5142                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5143
5144             mask += 16;
5145             dst += 16;
5146             w -= 16;
5147         }
5148
5149         while (w)
5150         {
5151             m = (uint32_t) *mask++;
5152             d = (uint32_t) *dst;
5153
5154             *dst++ = (uint8_t) pack_1x64_32 (
5155                 _mm_adds_pu16 (
5156                     pix_multiply_1x64 (
5157                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5158                     unpack_32_1x64 (d)));
5159
5160             w--;
5161         }
5162     }
5163
5164     _mm_empty ();
5165 }
5166
5167 /* ----------------------------------------------------------------------
5168  * composite_add_8000_8000
5169  */
5170
5171 static void
5172 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5173                               pixman_op_t              op,
5174                               pixman_image_t *         src_image,
5175                               pixman_image_t *         mask_image,
5176                               pixman_image_t *         dst_image,
5177                               int32_t                  src_x,
5178                               int32_t                  src_y,
5179                               int32_t                  mask_x,
5180                               int32_t                  mask_y,
5181                               int32_t                  dest_x,
5182                               int32_t                  dest_y,
5183                               int32_t                  width,
5184                               int32_t                  height)
5185 {
5186     uint8_t     *dst_line, *dst;
5187     uint8_t     *src_line, *src;
5188     int dst_stride, src_stride;
5189     uint16_t w;
5190     uint16_t t;
5191
5192     PIXMAN_IMAGE_GET_LINE (
5193         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5194     PIXMAN_IMAGE_GET_LINE (
5195         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5196
5197     while (height--)
5198     {
5199         dst = dst_line;
5200         src = src_line;
5201
5202         /* call prefetch hint to optimize cache load */
5203         cache_prefetch ((__m128i*)src);
5204         cache_prefetch ((__m128i*)dst);
5205
5206         dst_line += dst_stride;
5207         src_line += src_stride;
5208         w = width;
5209
5210         /* Small head */
5211         while (w && (unsigned long)dst & 3)
5212         {
5213             t = (*dst) + (*src++);
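            /* branch-free saturating add: if the byte sum overflowed,
             * (t >> 8) is 1 and 0 - 1 ORs in 0xff, clamping to 255 */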
5214             *dst++ = t | (0 - (t >> 8));
5215             w--;
5216         }
5217
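        /* The aligned middle of the scanline is handed to the 32-bit add
         * combiner; since it saturates per byte, four a8 pixels can be
         * processed as one uint32_t word. */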
5218         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5219
5220         /* Small tail */
5221         dst += w & 0xfffc;
5222         src += w & 0xfffc;
5223
5224         w &= 3;
5225
5226         while (w)
5227         {
5228             t = (*dst) + (*src++);
5229             *dst++ = t | (0 - (t >> 8));
5230             w--;
5231         }
5232     }
5233
5234     _mm_empty ();
5235 }
5236
5237 /* ---------------------------------------------------------------------
5238  * composite_add_8888_8888
5239  */
5240 static void
5241 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5242                               pixman_op_t              op,
5243                               pixman_image_t *         src_image,
5244                               pixman_image_t *         mask_image,
5245                               pixman_image_t *         dst_image,
5246                               int32_t                  src_x,
5247                               int32_t                  src_y,
5248                               int32_t                  mask_x,
5249                               int32_t                  mask_y,
5250                               int32_t                  dest_x,
5251                               int32_t                  dest_y,
5252                               int32_t                  width,
5253                               int32_t                  height)
5254 {
5255     uint32_t    *dst_line, *dst;
5256     uint32_t    *src_line, *src;
5257     int dst_stride, src_stride;
5258
5259     PIXMAN_IMAGE_GET_LINE (
5260         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5261     PIXMAN_IMAGE_GET_LINE (
5262         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5263
5264     while (height--)
5265     {
5266         dst = dst_line;
5267         dst_line += dst_stride;
5268         src = src_line;
5269         src_line += src_stride;
5270
5271         core_combine_add_u_sse2 (dst, src, NULL, width);
5272     }
5273
5274     _mm_empty ();
5275 }
5276
5277 /* -------------------------------------------------------------------------
5278  * sse2_composite_copy_area
5279  */
5280
5281 static pixman_bool_t
5282 pixman_blt_sse2 (uint32_t *src_bits,
5283                  uint32_t *dst_bits,
5284                  int       src_stride,
5285                  int       dst_stride,
5286                  int       src_bpp,
5287                  int       dst_bpp,
5288                  int       src_x,
5289                  int       src_y,
5290                  int       dst_x,
5291                  int       dst_y,
5292                  int       width,
5293                  int       height)
5294 {
5295     uint8_t *   src_bytes;
5296     uint8_t *   dst_bytes;
5297     int byte_width;
5298
5299     if (src_bpp != dst_bpp)
5300         return FALSE;
5301
5302     if (src_bpp == 16)
5303     {
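        /* rowstrides are stored in uint32_t units; convert them to
         * uint16_t units for a 16 bpp blit */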
5304         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5305         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5306         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5307         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5308         byte_width = 2 * width;
5309         src_stride *= 2;
5310         dst_stride *= 2;
5311     }
5312     else if (src_bpp == 32)
5313     {
5314         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5315         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5316         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5317         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5318         byte_width = 4 * width;
5319         src_stride *= 4;
5320         dst_stride *= 4;
5321     }
5322     else
5323     {
5324         return FALSE;
5325     }
5326
5327     cache_prefetch ((__m128i*)src_bytes);
5328     cache_prefetch ((__m128i*)dst_bytes);
5329
5330     while (height--)
5331     {
5332         int w;
5333         uint8_t *s = src_bytes;
5334         uint8_t *d = dst_bytes;
5335         src_bytes += src_stride;
5336         dst_bytes += dst_stride;
5337         w = byte_width;
5338
5339         cache_prefetch_next ((__m128i*)s);
5340         cache_prefetch_next ((__m128i*)d);
5341
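        /* align the destination: copy 16-bit, then 32-bit chunks until
         * d reaches 16-byte alignment */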
5342         while (w >= 2 && ((unsigned long)d & 3))
5343         {
5344             *(uint16_t *)d = *(uint16_t *)s;
5345             w -= 2;
5346             s += 2;
5347             d += 2;
5348         }
5349
5350         while (w >= 4 && ((unsigned long)d & 15))
5351         {
5352             *(uint32_t *)d = *(uint32_t *)s;
5353
5354             w -= 4;
5355             s += 4;
5356             d += 4;
5357         }
5358
5359         cache_prefetch_next ((__m128i*)s);
5360         cache_prefetch_next ((__m128i*)d);
5361
5362         while (w >= 64)
5363         {
5364             __m128i xmm0, xmm1, xmm2, xmm3;
5365
5366             /* 128 bytes ahead */
5367             cache_prefetch (((__m128i*)s) + 8);
5368             cache_prefetch (((__m128i*)d) + 8);
5369
5370             xmm0 = load_128_unaligned ((__m128i*)(s));
5371             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5372             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5373             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5374
5375             save_128_aligned ((__m128i*)(d),    xmm0);
5376             save_128_aligned ((__m128i*)(d + 16), xmm1);
5377             save_128_aligned ((__m128i*)(d + 32), xmm2);
5378             save_128_aligned ((__m128i*)(d + 48), xmm3);
5379
5380             s += 64;
5381             d += 64;
5382             w -= 64;
5383         }
5384
5385         cache_prefetch_next ((__m128i*)s);
5386         cache_prefetch_next ((__m128i*)d);
5387
5388         while (w >= 16)
5389         {
5390             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5391
5392             w -= 16;
5393             d += 16;
5394             s += 16;
5395         }
5396
5397         cache_prefetch_next ((__m128i*)s);
5398         cache_prefetch_next ((__m128i*)d);
5399
5400         while (w >= 4)
5401         {
5402             *(uint32_t *)d = *(uint32_t *)s;
5403
5404             w -= 4;
5405             s += 4;
5406             d += 4;
5407         }
5408
5409         if (w >= 2)
5410         {
5411             *(uint16_t *)d = *(uint16_t *)s;
5412             w -= 2;
5413             s += 2;
5414             d += 2;
5415         }
5416     }
5417
5418     _mm_empty ();
5419
5420     return TRUE;
5421 }
5422
5423 static void
5424 sse2_composite_copy_area (pixman_implementation_t *imp,
5425                           pixman_op_t              op,
5426                           pixman_image_t *         src_image,
5427                           pixman_image_t *         mask_image,
5428                           pixman_image_t *         dst_image,
5429                           int32_t                  src_x,
5430                           int32_t                  src_y,
5431                           int32_t                  mask_x,
5432                           int32_t                  mask_y,
5433                           int32_t                  dest_x,
5434                           int32_t                  dest_y,
5435                           int32_t                  width,
5436                           int32_t                  height)
5437 {
5438     pixman_blt_sse2 (src_image->bits.bits,
5439                      dst_image->bits.bits,
5440                      src_image->bits.rowstride,
5441                      dst_image->bits.rowstride,
5442                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5443                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5444                      src_x, src_y, dest_x, dest_y, width, height);
5445 }
5446
5447 static void
5448 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5449                                  pixman_op_t              op,
5450                                  pixman_image_t *         src_image,
5451                                  pixman_image_t *         mask_image,
5452                                  pixman_image_t *         dst_image,
5453                                  int32_t                  src_x,
5454                                  int32_t                  src_y,
5455                                  int32_t                  mask_x,
5456                                  int32_t                  mask_y,
5457                                  int32_t                  dest_x,
5458                                  int32_t                  dest_y,
5459                                  int32_t                  width,
5460                                  int32_t                  height)
5461 {
5462     uint32_t    *src, *src_line, s;
5463     uint32_t    *dst, *dst_line, d;
5464     uint8_t         *mask, *mask_line;
5465     uint32_t m;
5466     int src_stride, mask_stride, dst_stride;
5467     uint16_t w;
5468
5469     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5470     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5471     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5472
5473     PIXMAN_IMAGE_GET_LINE (
5474         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5475     PIXMAN_IMAGE_GET_LINE (
5476         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5477     PIXMAN_IMAGE_GET_LINE (
5478         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5479
5480     while (height--)
5481     {
5482         src = src_line;
5483         src_line += src_stride;
5484         dst = dst_line;
5485         dst_line += dst_stride;
5486         mask = mask_line;
5487         mask_line += mask_stride;
5488
5489         w = width;
5490
5491         /* call prefetch hint to optimize cache load */
5492         cache_prefetch ((__m128i*)src);
5493         cache_prefetch ((__m128i*)dst);
5494         cache_prefetch ((__m128i*)mask);
5495
5496         while (w && (unsigned long)dst & 15)
5497         {
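            /* the source is x8r8g8b8, so force the undefined alpha byte
             * to fully opaque before compositing */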
5498             s = 0xff000000 | *src++;
5499             m = (uint32_t) *mask++;
5500             d = *dst;
5501
5502             __m64 ms = unpack_32_1x64 (s);
5503
5504             if (m != 0xff)
5505             {
5506                 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5507                 __m64 md = unpack_32_1x64 (d);
5508
5509                 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5510             }
5511
5512             *dst++ = pack_1x64_32 (ms);
5513             w--;
5514         }
5515
5516         /* call prefetch hint to optimize cache load */
5517         cache_prefetch ((__m128i*)src);
5518         cache_prefetch ((__m128i*)dst);
5519         cache_prefetch ((__m128i*)mask);
5520
5521         while (w >= 4)
5522         {
5523             /* prefetch the next cache lines of src, dst and mask */
5524             cache_prefetch_next ((__m128i*)src);
5525             cache_prefetch_next ((__m128i*)dst);
5526             cache_prefetch_next ((__m128i*)mask);
5527
5528             m = *(uint32_t*) mask;
5529             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5530
5531             if (m == 0xffffffff)
5532             {
5533                 save_128_aligned ((__m128i*)dst, xmm_src);
5534             }
5535             else
5536             {
5537                 xmm_dst = load_128_aligned ((__m128i*)dst);
5538
5539                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128 ());
5540
5541                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5542                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5543                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5544
5545                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5546
5547                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5548
5549                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5550             }
5551
5552             src += 4;
5553             dst += 4;
5554             mask += 4;
5555             w -= 4;
5556         }
5557
5558         while (w)
5559         {
5560             m = (uint32_t) *mask++;
5561
5562             if (m)
5563             {
5564                 s = 0xff000000 | *src;
5565
5566                 if (m == 0xff)
5567                 {
5568                     *dst = s;
5569                 }
5570                 else
5571                 {
5572                     __m64 ma, md, ms;
5573
5574                     d = *dst;
5575
5576                     ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5577                     md = unpack_32_1x64 (d);
5578                     ms = unpack_32_1x64 (s);
5579
5580                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5581                 }
5582
5583             }
5584
5585             src++;
5586             dst++;
5587             w--;
5588         }
5589     }
5590
5591     _mm_empty ();
5592 }
5593
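/* Each entry lists the operator, source format, mask format, destination
 * format, the fast path function and its flags.
 */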
5594 static const pixman_fast_path_t sse2_fast_paths[] =
5595 {
5596     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   sse2_composite_over_n_8_0565,       0 },
5597     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   sse2_composite_over_n_8_0565,       0 },
5598     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_n_8888,         0 },
5599     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_n_8888,         0 },
5600     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_n_0565,         0 },
5601     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888,      0 },
5602     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888,      0 },
5603     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888,      0 },
5604     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888,      0 },
5605     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_8888_0565,      0 },
5606     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_over_8888_0565,      0 },
5607     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888,       0 },
5608     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888,       0 },
5609     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888,       0 },
5610     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888,       0 },
5611     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
5612     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
5613     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
5614     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
5615     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5616     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5617     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5618     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5619     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5620     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5621     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5622     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5623     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5624     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5625     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5626     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5627     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5628     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5629     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5630     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5631     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5632     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5633     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5634     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5635     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5636     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5637     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5638     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5639     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5640     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5641     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5642     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5643
5644     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca,  NEED_COMPONENT_ALPHA },
5645     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       sse2_composite_add_8000_8000,       0 },
5646     { PIXMAN_OP_ADD,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888,       0 },
5647     { PIXMAN_OP_ADD,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888,       0 },
5648     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       sse2_composite_add_n_8_8,           0 },
5649
5650     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888,        0 },
5651     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888,        0 },
5652     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888,        0 },
5653     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888,        0 },
5654     { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_copy_area,           0 },
5655     { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_copy_area,           0 },
5656     { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5657     { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5658     { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5659     { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5660     { PIXMAN_OP_SRC, PIXMAN_r5g6b5,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_copy_area,           0 },
5661     { PIXMAN_OP_SRC, PIXMAN_b5g6r5,    PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_copy_area,           0 },
5662
5663     { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       sse2_composite_in_8_8,              0 },
5664     { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       sse2_composite_in_n_8_8,            0 },
5665
5666     { PIXMAN_OP_NONE },
5667 };
5668
5669 /*
5670  * Work around GCC bug causing crashes in Mozilla with SSE2
5671  *
5672  * When using -msse, gcc generates movdqa instructions assuming that
5673  * the stack is 16 byte aligned. Unfortunately some applications, such
5674  * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
5675  * causes the movdqa instructions to fail.
5676  *
5677  * The __force_align_arg_pointer__ makes gcc generate a prologue that
5678  * realigns the stack pointer to 16 bytes.
5679  *
5680  * On x86-64 this is not necessary because the standard ABI already
5681  * calls for a 16 byte aligned stack.
5682  *
5683  * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
5684  */
5685 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5686 __attribute__((__force_align_arg_pointer__))
5687 #endif
5688 static void
5689 sse2_composite (pixman_implementation_t *imp,
5690                 pixman_op_t              op,
5691                 pixman_image_t *         src,
5692                 pixman_image_t *         mask,
5693                 pixman_image_t *         dest,
5694                 int32_t                  src_x,
5695                 int32_t                  src_y,
5696                 int32_t                  mask_x,
5697                 int32_t                  mask_y,
5698                 int32_t                  dest_x,
5699                 int32_t                  dest_y,
5700                 int32_t                  width,
5701                 int32_t                  height)
5702 {
5703     if (_pixman_run_fast_path (sse2_fast_paths, imp,
5704                                op, src, mask, dest,
5705                                src_x, src_y,
5706                                mask_x, mask_y,
5707                                dest_x, dest_y,
5708                                width, height))
5709     {
5710         return;
5711     }
5712
5713     _pixman_implementation_composite (imp->delegate, op,
5714                                       src, mask, dest,
5715                                       src_x, src_y,
5716                                       mask_x, mask_y,
5717                                       dest_x, dest_y,
5718                                       width, height);
5719 }
5720
5721 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5722 __attribute__((__force_align_arg_pointer__))
5723 #endif
5724 static pixman_bool_t
5725 sse2_blt (pixman_implementation_t *imp,
5726           uint32_t *               src_bits,
5727           uint32_t *               dst_bits,
5728           int                      src_stride,
5729           int                      dst_stride,
5730           int                      src_bpp,
5731           int                      dst_bpp,
5732           int                      src_x,
5733           int                      src_y,
5734           int                      dst_x,
5735           int                      dst_y,
5736           int                      width,
5737           int                      height)
5738 {
5739     if (!pixman_blt_sse2 (
5740             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5741             src_x, src_y, dst_x, dst_y, width, height))
5743     {
5744         return _pixman_implementation_blt (
5745             imp->delegate,
5746             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5747             src_x, src_y, dst_x, dst_y, width, height);
5748     }
5749
5750     return TRUE;
5751 }
5752
5753 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5754 __attribute__((__force_align_arg_pointer__))
5755 #endif
5756 static pixman_bool_t
5757 sse2_fill (pixman_implementation_t *imp,
5758            uint32_t *               bits,
5759            int                      stride,
5760            int                      bpp,
5761            int                      x,
5762            int                      y,
5763            int                      width,
5764            int                      height,
5765            uint32_t xor)
5766 {
5767     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5768     {
5769         return _pixman_implementation_fill (
5770             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5771     }
5772
5773     return TRUE;
5774 }
5775
5776 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5777 __attribute__((__force_align_arg_pointer__))
5778 #endif
5779 pixman_implementation_t *
5780 _pixman_implementation_create_sse2 (void)
5781 {
5782     pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5783     pixman_implementation_t *imp = _pixman_implementation_create (mmx);
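    /* anything without an SSE2 fast path or combiner is passed on to the
     * MMX implementation through the delegate chain */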
5784
5785     /* SSE2 constants */
5786     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5787     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5788     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5789     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5790     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5791     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5792     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5793     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5794     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
5795     mask_0080 = create_mask_16_128 (0x0080);
5796     mask_00ff = create_mask_16_128 (0x00ff);
5797     mask_0101 = create_mask_16_128 (0x0101);
5798     mask_ffff = create_mask_16_128 (0xffff);
5799     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5800     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5801
5802     /* MMX constants */
5803     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5804     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5805
5806     mask_x0080 = create_mask_16_64 (0x0080);
5807     mask_x00ff = create_mask_16_64 (0x00ff);
5808     mask_x0101 = create_mask_16_64 (0x0101);
5809     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5810
5811     _mm_empty ();
5812
5813     /* Set up function pointers */
5814
5815     /* SSE code patch for fbcompose.c */
5816     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5817     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5818     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5819     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5820     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5821     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5822     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5823     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5824     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5825     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5826
5827     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5828
5829     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5830     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5831     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5832     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5833     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5834     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5835     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5836     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5837     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5838     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5839     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5840
5841     imp->composite = sse2_composite;
5842     imp->blt = sse2_blt;
5843     imp->fill = sse2_fill;
5844
5845     return imp;
5846 }
5847
5848 #endif /* USE_SSE2 */