[sse2] Don't emit prefetch 0 for an absent mask
profile/ivi/pixman.git: pixman/pixman-sse2.c
1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <mmintrin.h>
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41  * the pixman-x64-mmx-emulation.h file contains
42  * implementations of those MMX intrinsics that
43  * are used in the SSE2 implementation.
44  */
45 #   include "pixman-x64-mmx-emulation.h"
46 #endif
47
48 #ifdef USE_SSE2
49
50 /* --------------------------------------------------------------------
51  * Locals
52  */
53
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
58
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
61
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
68
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
75
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
78
79 /* ----------------------------------------------------------------------
80  * SSE2 Inlines
81  */
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
84 {
85     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
86 }
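/* A worked illustration (not used by the code): for an a8r8g8b8 pixel
 * 0xAARRGGBB, unpack_32_1x128 produces the 16-bit lanes
 * { 0x00BB, 0x00GG, 0x00RR, 0x00AA, 0, 0, 0, 0 }, i.e. every channel is
 * widened to 16 bits so that per-channel multiplies cannot overflow.
 */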
87
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
90 {
91     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
93 }
94
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
97 {
98     __m128i r, g, b, rb, t;
99
100     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
103
104     rb = _mm_or_si128 (r, b);
105     t  = _mm_and_si128 (rb, mask_565_fix_rb);
106     t  = _mm_srli_epi32 (t, 5);
107     rb = _mm_or_si128 (rb, t);
108
109     t  = _mm_and_si128 (g, mask_565_fix_g);
110     t  = _mm_srli_epi32 (t, 6);
111     g  = _mm_or_si128 (g, t);
112
113     return _mm_or_si128 (rb, g);
114 }
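/* The fix-up steps above replicate the high bits of each channel into the
 * newly created low bits, i.e. r8 = (r5 << 3) | (r5 >> 2) and
 * g8 = (g6 << 2) | (g6 >> 4), so 0x1f and 0x3f expand to 0xff while 0
 * stays 0.  For example, the 5-bit value 10110 becomes 10110101.
 */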
115
116 static force_inline void
117 unpack_565_128_4x128 (__m128i  data,
118                       __m128i* data0,
119                       __m128i* data1,
120                       __m128i* data2,
121                       __m128i* data3)
122 {
123     __m128i lo, hi;
124
125     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
127
128     lo = unpack_565_to_8888 (lo);
129     hi = unpack_565_to_8888 (hi);
130
131     unpack_128_2x128 (lo, data0, data1);
132     unpack_128_2x128 (hi, data2, data3);
133 }
134
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
137 {
138     return (uint16_t) (((pixel >> 8) & 0xf800) |
139                        ((pixel >> 5) & 0x07e0) |
140                        ((pixel >> 3) & 0x001f));
141 }
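/* Worked example: packing x8r8g8b8 0x00ff8040 keeps the top 5/6/5 bits of
 * each channel and yields 0xf800 | 0x0400 | 0x0008 = 0xfc08 in r5g6b5.
 */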
142
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
145 {
146     return _mm_packus_epi16 (lo, hi);
147 }
148
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
151 {
152     __m128i data;
153     __m128i r, g1, g2, b;
154
155     data = pack_2x128_128 (lo, hi);
156
157     r  = _mm_and_si128 (data, mask_565_r);
158     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
162     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163 }
164
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167 {
168     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169                              pack_565_2x128_128 (*xmm2, *xmm3));
170 }
171
172 static force_inline int
173 is_opaque (__m128i x)
174 {
175     __m128i ffs = _mm_cmpeq_epi8 (x, x);
176
177     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178 }
179
180 static force_inline int
181 is_zero (__m128i x)
182 {
183     return _mm_movemask_epi8 (
184         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185 }
186
187 static force_inline int
188 is_transparent (__m128i x)
189 {
190     return (_mm_movemask_epi8 (
191                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192 }
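/* A note on the 0x8888 masks above: _mm_movemask_epi8 returns one bit per
 * byte, and for a8r8g8b8 data bits 3, 7, 11 and 15 correspond to the four
 * alpha bytes.  Masking with 0x8888 therefore tests all four alphas at
 * once: is_opaque checks for alpha == 0xff, is_transparent for
 * alpha == 0x00, and is_zero for a completely zero vector.
 */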
193
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
196 {
197     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198 }
199
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
202 {
203     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204                                                      _MM_SHUFFLE (3, 3, 3, 3)),
205                                 _MM_SHUFFLE (3, 3, 3, 3));
206 }
207
208 static force_inline void
209 expand_alpha_2x128 (__m128i  data_lo,
210                     __m128i  data_hi,
211                     __m128i* alpha_lo,
212                     __m128i* alpha_hi)
213 {
214     __m128i lo, hi;
215
216     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
219     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221 }
222
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i  data_lo,
225                         __m128i  data_hi,
226                         __m128i* alpha_lo,
227                         __m128i* alpha_hi)
228 {
229     __m128i lo, hi;
230
231     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235 }
236
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
239                     __m128i* data_hi,
240                     __m128i* alpha_lo,
241                     __m128i* alpha_hi,
242                     __m128i* ret_lo,
243                     __m128i* ret_hi)
244 {
245     __m128i lo, hi;
246
247     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249     lo = _mm_adds_epu16 (lo, mask_0080);
250     hi = _mm_adds_epu16 (hi, mask_0080);
251     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253 }
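/* The sequence above is the usual exact divide-by-255: per 16-bit channel
 * it computes ((x * a + 0x80) * 0x0101) >> 16, which equals x * a / 255
 * rounded to nearest, using a single mulhi instead of the shift-and-add
 * formulation.
 */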
254
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
257                         __m128i* src_hi,
258                         __m128i* alpha_dst_lo,
259                         __m128i* alpha_dst_hi,
260                         __m128i* dst_lo,
261                         __m128i* dst_hi,
262                         __m128i* alpha_src_lo,
263                         __m128i* alpha_src_hi,
264                         __m128i* ret_lo,
265                         __m128i* ret_hi)
266 {
267     __m128i t1_lo, t1_hi;
268     __m128i t2_lo, t2_hi;
269
270     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272
273     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275 }
276
277 static force_inline void
278 negate_2x128 (__m128i  data_lo,
279               __m128i  data_hi,
280               __m128i* neg_lo,
281               __m128i* neg_hi)
282 {
283     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285 }
286
287 static force_inline void
288 invert_colors_2x128 (__m128i  data_lo,
289                      __m128i  data_hi,
290                      __m128i* inv_lo,
291                      __m128i* inv_hi)
292 {
293     __m128i lo, hi;
294
295     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299 }
300
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
303             __m128i* src_hi,
304             __m128i* alpha_lo,
305             __m128i* alpha_hi,
306             __m128i* dst_lo,
307             __m128i* dst_hi)
308 {
309     __m128i t1, t2;
310
311     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
312
313     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
314
315     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
317 }
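/* over_2x128 is Porter-Duff OVER for premultiplied pixels:
 * dst = src + dst * (255 - alpha_src) / 255 per channel; the saturating
 * add only matters as a guard when the inputs are not properly
 * premultiplied.
 */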
318
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i  src_lo,
321                         __m128i  src_hi,
322                         __m128i* dst_lo,
323                         __m128i* dst_hi)
324 {
325     __m128i lo, hi;
326     __m128i alpha_lo, alpha_hi;
327
328     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
329
330     lo = _mm_or_si128 (alpha_lo, mask_alpha);
331     hi = _mm_or_si128 (alpha_hi, mask_alpha);
332
333     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
334
335     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
336
337     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
338 }
339
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
342                __m128i* src_hi,
343                __m128i* alpha_lo,
344                __m128i* alpha_hi,
345                __m128i* mask_lo,
346                __m128i* mask_hi,
347                __m128i* dst_lo,
348                __m128i* dst_hi)
349 {
350     __m128i s_lo, s_hi;
351     __m128i a_lo, a_hi;
352
353     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355
356     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357 }
358
359 static force_inline void
360 cache_prefetch (__m128i* addr)
361 {
362     _mm_prefetch ((void const*)addr, _MM_HINT_T0);
363 }
364
365 static force_inline void
366 cache_prefetch_next (__m128i* addr)
367 {
368     _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
369 }
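/* cache_prefetch touches the cache line containing addr, and
 * cache_prefetch_next the line 4 * 16 = 64 bytes further on (one typical
 * cache line ahead).  Prefetches are only hints and never fault, but
 * prefetching address 0 is wasted work, so callers that may be handed a
 * NULL mask guard the mask prefetch with "if (pm)", as
 * core_combine_over_u_sse2 does below; that is what this patch is about.
 */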
370
371 /* load 4 pixels from a 16-byte aligned address */
372 static force_inline __m128i
373 load_128_aligned (__m128i* src)
374 {
375     return _mm_load_si128 (src);
376 }
377
378 /* load 4 pixels from an unaligned address */
379 static force_inline __m128i
380 load_128_unaligned (const __m128i* src)
381 {
382     return _mm_loadu_si128 (src);
383 }
384
385 /* save 4 pixels using write-combining (non-temporal) stores to a
386  * 16-byte aligned address
387  */
388 static force_inline void
389 save_128_write_combining (__m128i* dst,
390                           __m128i  data)
391 {
392     _mm_stream_si128 (dst, data);
393 }
394
395 /* save 4 pixels to a 16-byte aligned address */
396 static force_inline void
397 save_128_aligned (__m128i* dst,
398                   __m128i  data)
399 {
400     _mm_store_si128 (dst, data);
401 }
402
403 /* save 4 pixels to an unaligned address */
404 static force_inline void
405 save_128_unaligned (__m128i* dst,
406                     __m128i  data)
407 {
408     _mm_storeu_si128 (dst, data);
409 }
410
411 /* ------------------------------------------------------------------
412  * MMX inlines
413  */
414
415 static force_inline __m64
416 unpack_32_1x64 (uint32_t data)
417 {
418     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
419 }
420
421 static force_inline __m64
422 expand_alpha_1x64 (__m64 data)
423 {
424     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
425 }
426
427 static force_inline __m64
428 expand_alpha_rev_1x64 (__m64 data)
429 {
430     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
431 }
432
433 static force_inline __m64
434 expand_pixel_8_1x64 (uint8_t data)
435 {
436     return _mm_shuffle_pi16 (
437         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
438 }
439
440 static force_inline __m64
441 pix_multiply_1x64 (__m64 data,
442                    __m64 alpha)
443 {
444     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
445                                           mask_x0080),
446                            mask_x0101);
447 }
448
449 static force_inline __m64
450 pix_add_multiply_1x64 (__m64* src,
451                        __m64* alpha_dst,
452                        __m64* dst,
453                        __m64* alpha_src)
454 {
455     __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
456     __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
457
458     return _mm_adds_pu8 (t1, t2);
459 }
460
461 static force_inline __m64
462 negate_1x64 (__m64 data)
463 {
464     return _mm_xor_si64 (data, mask_x00ff);
465 }
466
467 static force_inline __m64
468 invert_colors_1x64 (__m64 data)
469 {
470     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
471 }
472
473 static force_inline __m64
474 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
475 {
476     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
477 }
478
479 static force_inline __m64
480 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
481 {
482     return over_1x64 (pix_multiply_1x64 (*src, *mask),
483                       pix_multiply_1x64 (*alpha, *mask),
484                       *dst);
485 }
486
487 static force_inline __m64
488 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
489 {
490     __m64 alpha = expand_alpha_1x64 (src);
491
492     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
493                                          _mm_or_si64 (alpha, mask_x_alpha)),
494                       alpha,
495                       dst);
496 }
497
498 static force_inline uint32_t
499 pack_1x64_32 (__m64 data)
500 {
501     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
502 }
503
504 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into
505  *
506  *    00RR00GG00BB
507  *
508  * --- Expanding 565 in the low word ---
509  *
510  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
511  * m = m & (01f0003f001f);
512  * m = m * (008404100840);
513  * m = m >> 8;
514  *
515  * Note the trick here - the top word is shifted by another nibble to
516  * avoid it bumping into the middle word
517  */
518 static force_inline __m64
519 expand565_16_1x64 (uint16_t pixel)
520 {
521     __m64 p;
522     __m64 t1, t2;
523
524     p = _mm_cvtsi32_si64 ((uint32_t) pixel);
525
526     t1 = _mm_slli_si64 (p, 36 - 11);
527     t2 = _mm_slli_si64 (p, 16 - 5);
528
529     p = _mm_or_si64 (t1, p);
530     p = _mm_or_si64 (t2, p);
531     p = _mm_and_si64 (p, mask_x565_rgb);
532     p = _mm_mullo_pi16 (p, mask_x565_unpack);
533
534     return _mm_srli_pi16 (p, 8);
535 }
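/* After the mask each colour field sits in its own 16-bit word, so the
 * per-word multiply (0x0840 for red/blue, 0x0410 for green, per the
 * comment above) followed by the shift right by 8 performs the same bit
 * replication as unpack_565_to_8888: r8 = (r5 << 3) | (r5 >> 2),
 * g8 = (g6 << 2) | (g6 >> 4).
 */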
536
537 /* ----------------------------------------------------------------------------
538  * Compose Core transformations
539  */
540 static force_inline uint32_t
541 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
542 {
543     uint8_t a;
544     __m64 ms;
545
546     a = src >> 24;
547
548     if (a == 0xff)
549     {
550         return src;
551     }
552     else if (src)
553     {
554         ms = unpack_32_1x64 (src);
555         return pack_1x64_32 (
556             over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
557     }
558
559     return dst;
560 }
561
562 static force_inline uint32_t
563 combine1 (const uint32_t *ps, const uint32_t *pm)
564 {
565     uint32_t s = *ps;
566
567     if (pm)
568     {
569         __m64 ms, mm;
570
571         mm = unpack_32_1x64 (*pm);
572         mm = expand_alpha_1x64 (mm);
573
574         ms = unpack_32_1x64 (s);
575         ms = pix_multiply_1x64 (ms, mm);
576
577         s = pack_1x64_32 (ms);
578     }
579
580     return s;
581 }
582
583 static force_inline __m128i
584 combine4 (const __m128i *ps, const __m128i *pm)
585 {
586     __m128i xmm_src_lo, xmm_src_hi;
587     __m128i xmm_msk_lo, xmm_msk_hi;
588     __m128i s;
589
590     if (pm)
591     {
592         xmm_msk_lo = load_128_unaligned (pm);
593
594         if (is_transparent (xmm_msk_lo))
595             return _mm_setzero_si128 ();
596     }
597
598     s = load_128_unaligned (ps);
599
600     if (pm)
601     {
602         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
603         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
604
605         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
606
607         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
608                             &xmm_msk_lo, &xmm_msk_hi,
609                             &xmm_src_lo, &xmm_src_hi);
610
611         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
612     }
613
614     return s;
615 }
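/* combine1 and combine4 apply an optional a8r8g8b8 mask to one or four
 * source pixels: with pm == NULL the source is returned unchanged,
 * otherwise every source channel is multiplied by the corresponding mask
 * alpha.  When all four mask alphas are zero, combine4 skips the source
 * load and returns zero.
 */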
616
617 static force_inline void
618 core_combine_over_u_sse2 (uint32_t*       pd,
619                           const uint32_t* ps,
620                           const uint32_t* pm,
621                           int             w)
622 {
623     uint32_t s, d;
624
625     __m128i xmm_dst_lo, xmm_dst_hi;
626     __m128i xmm_src_lo, xmm_src_hi;
627     __m128i xmm_alpha_lo, xmm_alpha_hi;
628
629     /* issue prefetch hints to optimize cache loads */
630     cache_prefetch ((__m128i*)ps);
631     cache_prefetch ((__m128i*)pd);
632     if (pm)
633         cache_prefetch ((__m128i*)pm);
634
635     /* Align dst on a 16-byte boundary */
636     while (w && ((unsigned long)pd & 15))
637     {
638         d = *pd;
639         s = combine1 (ps, pm);
640
641         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
642         ps++;
643         if (pm)
644             pm++;
645         w--;
646     }
647
648     /* issue prefetch hints to optimize cache loads */
649     cache_prefetch ((__m128i*)ps);
650     cache_prefetch ((__m128i*)pd);
651     if (pm)
652         cache_prefetch ((__m128i*)pm);
653
654     while (w >= 4)
655     {
656         /* fill cache line with next memory */
657         cache_prefetch_next ((__m128i*)ps);
658         cache_prefetch_next ((__m128i*)pd);
659         if (pm)
660             cache_prefetch_next ((__m128i*)pm);
661
662         /* I'm loading unaligned because I'm not sure about
663          * the address alignment.
664          */
665         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
666
667         if (is_opaque (xmm_src_hi))
668         {
669             save_128_aligned ((__m128i*)pd, xmm_src_hi);
670         }
671         else if (!is_zero (xmm_src_hi))
672         {
673             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
674
675             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
676             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
677
678             expand_alpha_2x128 (
679                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
680
681             over_2x128 (&xmm_src_lo, &xmm_src_hi,
682                         &xmm_alpha_lo, &xmm_alpha_hi,
683                         &xmm_dst_lo, &xmm_dst_hi);
684
685             /* rebuild the 4 pixel data and save */
686             save_128_aligned ((__m128i*)pd,
687                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
688         }
689
690         w -= 4;
691         ps += 4;
692         pd += 4;
693         if (pm)
694             pm += 4;
695     }
696
697     while (w)
698     {
699         d = *pd;
700         s = combine1 (ps, pm);
701
702         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
703         ps++;
704         if (pm)
705             pm++;
706
707         w--;
708     }
709 }
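/* Sketch of how the combiner above is driven (illustrative only; the
 * variable names are hypothetical):
 *
 *     uint32_t *dest;            // destination scanline, any alignment
 *     const uint32_t *src, *msk; // source and optional mask scanlines
 *     int width;                 // run length in pixels
 *
 *     core_combine_over_u_sse2 (dest, src, NULL, width);  // unmasked
 *     core_combine_over_u_sse2 (dest, src, msk, width);   // masked
 *
 * With a NULL mask the mask prefetches are skipped entirely, so no
 * prefetch of address 0 is emitted for an absent mask.
 */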
710
711 static force_inline void
712 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
713                                   const uint32_t* ps,
714                                   const uint32_t* pm,
715                                   int             w)
716 {
717     uint32_t s, d;
718
719     __m128i xmm_dst_lo, xmm_dst_hi;
720     __m128i xmm_src_lo, xmm_src_hi;
721     __m128i xmm_alpha_lo, xmm_alpha_hi;
722
723     /* issue prefetch hints to optimize cache loads */
724     cache_prefetch ((__m128i*)ps);
725     cache_prefetch ((__m128i*)pd);
726     cache_prefetch ((__m128i*)pm);
727
728     /* Align dst on a 16-byte boundary */
729     while (w &&
730            ((unsigned long)pd & 15))
731     {
732         d = *pd;
733         s = combine1 (ps, pm);
734
735         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
736         w--;
737         ps++;
738         if (pm)
739             pm++;
740     }
741
742     /* issue prefetch hints to optimize cache loads */
743     cache_prefetch ((__m128i*)ps);
744     cache_prefetch ((__m128i*)pd);
745     cache_prefetch ((__m128i*)pm);
746
747     while (w >= 4)
748     {
749         /* fill cache line with next memory */
750         cache_prefetch_next ((__m128i*)ps);
751         cache_prefetch_next ((__m128i*)pd);
752         cache_prefetch_next ((__m128i*)pm);
753
754         /* I'm loading unaligned because I'm not sure
755          * about the address alignment.
756          */
757         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
758         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
759
760         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
761         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
762
763         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
764                             &xmm_alpha_lo, &xmm_alpha_hi);
765
766         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
767                     &xmm_alpha_lo, &xmm_alpha_hi,
768                     &xmm_src_lo, &xmm_src_hi);
769
770         /* rebuild the 4 pixel data and save */
771         save_128_aligned ((__m128i*)pd,
772                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
773
774         w -= 4;
775         ps += 4;
776         pd += 4;
777
778         if (pm)
779             pm += 4;
780     }
781
782     while (w)
783     {
784         d = *pd;
785         s = combine1 (ps, pm);
786
787         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
788         ps++;
789         w--;
790         if (pm)
791             pm++;
792     }
793 }
794
795 static force_inline uint32_t
796 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
797 {
798     uint32_t maska = src >> 24;
799
800     if (maska == 0)
801     {
802         return 0;
803     }
804     else if (maska != 0xff)
805     {
806         return pack_1x64_32 (
807             pix_multiply_1x64 (unpack_32_1x64 (dst),
808                                expand_alpha_1x64 (unpack_32_1x64 (src))));
809     }
810
811     return dst;
812 }
813
814 static force_inline void
815 core_combine_in_u_sse2 (uint32_t*       pd,
816                         const uint32_t* ps,
817                         const uint32_t* pm,
818                         int             w)
819 {
820     uint32_t s, d;
821
822     __m128i xmm_src_lo, xmm_src_hi;
823     __m128i xmm_dst_lo, xmm_dst_hi;
824
825     /* issue prefetch hints to optimize cache loads */
826     cache_prefetch ((__m128i*)ps);
827     cache_prefetch ((__m128i*)pd);
828     cache_prefetch ((__m128i*)pm);
829
830     while (w && ((unsigned long) pd & 15))
831     {
832         s = combine1 (ps, pm);
833         d = *pd;
834
835         *pd++ = core_combine_in_u_pixelsse2 (d, s);
836         w--;
837         ps++;
838         if (pm)
839             pm++;
840     }
841
842     /* issue prefetch hints to optimize cache loads */
843     cache_prefetch ((__m128i*)ps);
844     cache_prefetch ((__m128i*)pd);
845     cache_prefetch ((__m128i*)pm);
846
847     while (w >= 4)
848     {
849         /* fill cache line with next memory */
850         cache_prefetch_next ((__m128i*)ps);
851         cache_prefetch_next ((__m128i*)pd);
852         cache_prefetch_next ((__m128i*)pm);
853
854         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
855         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
856
857         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
858         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
859
860         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
861         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
862                             &xmm_dst_lo, &xmm_dst_hi,
863                             &xmm_dst_lo, &xmm_dst_hi);
864
865         save_128_aligned ((__m128i*)pd,
866                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
867
868         ps += 4;
869         pd += 4;
870         w -= 4;
871         if (pm)
872             pm += 4;
873     }
874
875     while (w)
876     {
877         s = combine1 (ps, pm);
878         d = *pd;
879
880         *pd++ = core_combine_in_u_pixelsse2 (d, s);
881         w--;
882         ps++;
883         if (pm)
884             pm++;
885     }
886 }
887
888 static force_inline void
889 core_combine_reverse_in_u_sse2 (uint32_t*       pd,
890                                 const uint32_t* ps,
891                                 const uint32_t *pm,
892                                 int             w)
893 {
894     uint32_t s, d;
895
896     __m128i xmm_src_lo, xmm_src_hi;
897     __m128i xmm_dst_lo, xmm_dst_hi;
898
899     /* issue prefetch hints to optimize cache loads */
900     cache_prefetch ((__m128i*)ps);
901     cache_prefetch ((__m128i*)pd);
902     cache_prefetch ((__m128i*)pm);
903
904     while (w && ((unsigned long) pd & 15))
905     {
906         s = combine1 (ps, pm);
907         d = *pd;
908
909         *pd++ = core_combine_in_u_pixelsse2 (s, d);
910         ps++;
911         w--;
912         if (pm)
913             pm++;
914     }
915
916     /* issue prefetch hints to optimize cache loads */
917     cache_prefetch ((__m128i*)ps);
918     cache_prefetch ((__m128i*)pd);
919     cache_prefetch ((__m128i*)pm);
920
921     while (w >= 4)
922     {
923         /* fill cache line with next memory */
924         cache_prefetch_next ((__m128i*)ps);
925         cache_prefetch_next ((__m128i*)pd);
926         cache_prefetch_next ((__m128i*)pm);
927
928         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
929         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
930
931         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
932         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
933
934         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
935         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
936                             &xmm_src_lo, &xmm_src_hi,
937                             &xmm_dst_lo, &xmm_dst_hi);
938
939         save_128_aligned (
940             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
941
942         ps += 4;
943         pd += 4;
944         w -= 4;
945         if (pm)
946             pm += 4;
947     }
948
949     while (w)
950     {
951         s = combine1 (ps, pm);
952         d = *pd;
953
954         *pd++ = core_combine_in_u_pixelsse2 (s, d);
955         w--;
956         ps++;
957         if (pm)
958             pm++;
959     }
960 }
961
962 static force_inline void
963 core_combine_reverse_out_u_sse2 (uint32_t*       pd,
964                                  const uint32_t* ps,
965                                  const uint32_t* pm,
966                                  int             w)
967 {
968     /* issue prefetch hints to optimize cache loads */
969     cache_prefetch ((__m128i*)ps);
970     cache_prefetch ((__m128i*)pd);
971     cache_prefetch ((__m128i*)pm);
972
973     while (w && ((unsigned long) pd & 15))
974     {
975         uint32_t s = combine1 (ps, pm);
976         uint32_t d = *pd;
977
978         *pd++ = pack_1x64_32 (
979             pix_multiply_1x64 (
980                 unpack_32_1x64 (d), negate_1x64 (
981                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
982         
983         if (pm)
984             pm++;
985         ps++;
986         w--;
987     }
988
989     /* issue prefetch hints to optimize cache loads */
990     cache_prefetch ((__m128i*)ps);
991     cache_prefetch ((__m128i*)pd);
992     cache_prefetch ((__m128i*)pm);
993
994     while (w >= 4)
995     {
996         __m128i xmm_src_lo, xmm_src_hi;
997         __m128i xmm_dst_lo, xmm_dst_hi;
998
999         /* fill cache line with next memory */
1000         cache_prefetch_next ((__m128i*)ps);
1001         cache_prefetch_next ((__m128i*)pd);
1002         cache_prefetch_next ((__m128i*)pm);
1003
1004         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1005         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1006
1007         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1008         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1009
1010         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1011         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1012
1013         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1014                             &xmm_src_lo, &xmm_src_hi,
1015                             &xmm_dst_lo, &xmm_dst_hi);
1016
1017         save_128_aligned (
1018             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1019
1020         ps += 4;
1021         pd += 4;
1022         if (pm)
1023             pm += 4;
1024
1025         w -= 4;
1026     }
1027
1028     while (w)
1029     {
1030         uint32_t s = combine1 (ps, pm);
1031         uint32_t d = *pd;
1032
1033         *pd++ = pack_1x64_32 (
1034             pix_multiply_1x64 (
1035                 unpack_32_1x64 (d), negate_1x64 (
1036                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1037         ps++;
1038         if (pm)
1039             pm++;
1040         w--;
1041     }
1042 }
1043
1044 static force_inline void
1045 core_combine_out_u_sse2 (uint32_t*       pd,
1046                          const uint32_t* ps,
1047                          const uint32_t* pm,
1048                          int             w)
1049 {
1050     /* issue prefetch hints to optimize cache loads */
1051     cache_prefetch ((__m128i*)ps);
1052     cache_prefetch ((__m128i*)pd);
1053     cache_prefetch ((__m128i*)pm);
1054
1055     while (w && ((unsigned long) pd & 15))
1056     {
1057         uint32_t s = combine1 (ps, pm);
1058         uint32_t d = *pd;
1059
1060         *pd++ = pack_1x64_32 (
1061             pix_multiply_1x64 (
1062                 unpack_32_1x64 (s), negate_1x64 (
1063                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1064         w--;
1065         ps++;
1066         if (pm)
1067             pm++;
1068     }
1069
1070     /* issue prefetch hints to optimize cache loads */
1071     cache_prefetch ((__m128i*)ps);
1072     cache_prefetch ((__m128i*)pd);
1073     cache_prefetch ((__m128i*)pm);
1074
1075     while (w >= 4)
1076     {
1077         __m128i xmm_src_lo, xmm_src_hi;
1078         __m128i xmm_dst_lo, xmm_dst_hi;
1079
1080         /* fill cache line with next memory */
1081         cache_prefetch_next ((__m128i*)ps);
1082         cache_prefetch_next ((__m128i*)pd);
1083         cache_prefetch_next ((__m128i*)pm);
1084
1085         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1086         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1087
1088         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1089         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1090
1091         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1092         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1093
1094         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1095                             &xmm_dst_lo, &xmm_dst_hi,
1096                             &xmm_dst_lo, &xmm_dst_hi);
1097
1098         save_128_aligned (
1099             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1100
1101         ps += 4;
1102         pd += 4;
1103         w -= 4;
1104         if (pm)
1105             pm += 4;
1106     }
1107
1108     while (w)
1109     {
1110         uint32_t s = combine1 (ps, pm);
1111         uint32_t d = *pd;
1112
1113         *pd++ = pack_1x64_32 (
1114             pix_multiply_1x64 (
1115                 unpack_32_1x64 (s), negate_1x64 (
1116                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1117         w--;
1118         ps++;
1119         if (pm)
1120             pm++;
1121     }
1122 }
1123
1124 static force_inline uint32_t
1125 core_combine_atop_u_pixel_sse2 (uint32_t src,
1126                                 uint32_t dst)
1127 {
1128     __m64 s = unpack_32_1x64 (src);
1129     __m64 d = unpack_32_1x64 (dst);
1130
1131     __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1132     __m64 da = expand_alpha_1x64 (d);
1133
1134     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1135 }
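/* This is Porter-Duff ATOP per channel:
 * result = (src * alpha_dst + dst * (255 - alpha_src)) / 255,
 * which is exactly what pix_add_multiply_1x64 computes from the two
 * prepared alpha values.
 */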
1136
1137 static force_inline void
1138 core_combine_atop_u_sse2 (uint32_t*       pd,
1139                           const uint32_t* ps,
1140                           const uint32_t* pm,
1141                           int             w)
1142 {
1143     uint32_t s, d;
1144
1145     __m128i xmm_src_lo, xmm_src_hi;
1146     __m128i xmm_dst_lo, xmm_dst_hi;
1147     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1148     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1149
1150     /* issue prefetch hints to optimize cache loads */
1151     cache_prefetch ((__m128i*)ps);
1152     cache_prefetch ((__m128i*)pd);
1153     cache_prefetch ((__m128i*)pm);
1154
1155     while (w && ((unsigned long) pd & 15))
1156     {
1157         s = combine1 (ps, pm);
1158         d = *pd;
1159
1160         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1161         w--;
1162         ps++;
1163         if (pm)
1164             pm++;
1165     }
1166
1167     /* issue prefetch hints to optimize cache loads */
1168     cache_prefetch ((__m128i*)ps);
1169     cache_prefetch ((__m128i*)pd);
1170     cache_prefetch ((__m128i*)pm);
1171
1172     while (w >= 4)
1173     {
1174         /* fill cache line with next memory */
1175         cache_prefetch_next ((__m128i*)ps);
1176         cache_prefetch_next ((__m128i*)pd);
1177         cache_prefetch_next ((__m128i*)pm);
1178
1179         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1180         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1181
1182         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1183         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1184
1185         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1186                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1187         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1188                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1189
1190         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1191                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1192
1193         pix_add_multiply_2x128 (
1194             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1195             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1196             &xmm_dst_lo, &xmm_dst_hi);
1197
1198         save_128_aligned (
1199             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1200
1201         ps += 4;
1202         pd += 4;
1203         w -= 4;
1204         if (pm)
1205             pm += 4;
1206     }
1207
1208     while (w)
1209     {
1210         s = combine1 (ps, pm);
1211         d = *pd;
1212
1213         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1214         w--;
1215         ps++;
1216         if (pm)
1217             pm++;
1218     }
1219 }
1220
1221 static force_inline uint32_t
1222 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1223                                         uint32_t dst)
1224 {
1225     __m64 s = unpack_32_1x64 (src);
1226     __m64 d = unpack_32_1x64 (dst);
1227
1228     __m64 sa = expand_alpha_1x64 (s);
1229     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1230
1231     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1232 }
1233
1234 static force_inline void
1235 core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
1236                                   const uint32_t* ps,
1237                                   const uint32_t* pm,
1238                                   int             w)
1239 {
1240     uint32_t s, d;
1241
1242     __m128i xmm_src_lo, xmm_src_hi;
1243     __m128i xmm_dst_lo, xmm_dst_hi;
1244     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1245     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1246
1247     /* issue prefetch hints to optimize cache loads */
1248     cache_prefetch ((__m128i*)ps);
1249     cache_prefetch ((__m128i*)pd);
1250     cache_prefetch ((__m128i*)pm);
1251
1252     while (w && ((unsigned long) pd & 15))
1253     {
1254         s = combine1 (ps, pm);
1255         d = *pd;
1256
1257         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1258         ps++;
1259         w--;
1260         if (pm)
1261             pm++;
1262     }
1263
1264     /* issue prefetch hints to optimize cache loads */
1265     cache_prefetch ((__m128i*)ps);
1266     cache_prefetch ((__m128i*)pd);
1267     cache_prefetch ((__m128i*)pm);
1268
1269     while (w >= 4)
1270     {
1271         /* fill cache line with next memory */
1272         cache_prefetch_next ((__m128i*)ps);
1273         cache_prefetch_next ((__m128i*)pd);
1274         cache_prefetch_next ((__m128i*)pm);
1275
1276         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1277         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1278
1279         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1280         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1281
1282         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1283                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1284         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1285                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1286
1287         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1288                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1289
1290         pix_add_multiply_2x128 (
1291             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1292             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1293             &xmm_dst_lo, &xmm_dst_hi);
1294
1295         save_128_aligned (
1296             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1297
1298         ps += 4;
1299         pd += 4;
1300         w -= 4;
1301         if (pm)
1302             pm += 4;
1303     }
1304
1305     while (w)
1306     {
1307         s = combine1 (ps, pm);
1308         d = *pd;
1309
1310         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1311         ps++;
1312         w--;
1313         if (pm)
1314             pm++;
1315     }
1316 }
1317
1318 static force_inline uint32_t
1319 core_combine_xor_u_pixel_sse2 (uint32_t src,
1320                                uint32_t dst)
1321 {
1322     __m64 s = unpack_32_1x64 (src);
1323     __m64 d = unpack_32_1x64 (dst);
1324
1325     __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1326     __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1327
1328     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1329 }
1330
1331 static force_inline void
1332 core_combine_xor_u_sse2 (uint32_t*       dst,
1333                          const uint32_t* src,
1334                          const uint32_t *mask,
1335                          int             width)
1336 {
1337     int w = width;
1338     uint32_t s, d;
1339     uint32_t* pd = dst;
1340     const uint32_t* ps = src;
1341     const uint32_t* pm = mask;
1342
1343     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1344     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1345     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1346     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1347
1348     /* issue prefetch hints to optimize cache loads */
1349     cache_prefetch ((__m128i*)ps);
1350     cache_prefetch ((__m128i*)pd);
1351     cache_prefetch ((__m128i*)pm);
1352
1353     while (w && ((unsigned long) pd & 15))
1354     {
1355         s = combine1 (ps, pm);
1356         d = *pd;
1357
1358         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1359         w--;
1360         ps++;
1361         if (pm)
1362             pm++;
1363     }
1364
1365     /* issue prefetch hints to optimize cache loads */
1366     cache_prefetch ((__m128i*)ps);
1367     cache_prefetch ((__m128i*)pd);
1368     cache_prefetch ((__m128i*)pm);
1369
1370     while (w >= 4)
1371     {
1372         /* fill cache line with next memory */
1373         cache_prefetch_next ((__m128i*)ps);
1374         cache_prefetch_next ((__m128i*)pd);
1375         cache_prefetch_next ((__m128i*)pm);
1376
1377         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1378         xmm_dst = load_128_aligned ((__m128i*) pd);
1379
1380         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1381         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1382
1383         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1384                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1385         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1386                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1387
1388         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1389                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1390         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1391                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1392
1393         pix_add_multiply_2x128 (
1394             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1395             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1396             &xmm_dst_lo, &xmm_dst_hi);
1397
1398         save_128_aligned (
1399             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1400
1401         ps += 4;
1402         pd += 4;
1403         w -= 4;
1404         if (pm)
1405             pm += 4;
1406     }
1407
1408     while (w)
1409     {
1410         s = combine1 (ps, pm);
1411         d = *pd;
1412
1413         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1414         w--;
1415         ps++;
1416         if (pm)
1417             pm++;
1418     }
1419 }
1420
1421 static force_inline void
1422 core_combine_add_u_sse2 (uint32_t*       dst,
1423                          const uint32_t* src,
1424                          const uint32_t* mask,
1425                          int             width)
1426 {
1427     int w = width;
1428     uint32_t s, d;
1429     uint32_t* pd = dst;
1430     const uint32_t* ps = src;
1431     const uint32_t* pm = mask;
1432
1433     /* issue prefetch hints to optimize cache loads */
1434     cache_prefetch ((__m128i*)ps);
1435     cache_prefetch ((__m128i*)pd);
1436     cache_prefetch ((__m128i*)pm);
1437
1438     while (w && (unsigned long)pd & 15)
1439     {
1440         s = combine1 (ps, pm);
1441         d = *pd;
1442
1443         ps++;
1444         if (pm)
1445             pm++;
1446         *pd++ = _mm_cvtsi64_si32 (
1447             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1448         w--;
1449     }
1450
1451     /* issue prefetch hints to optimize cache loads */
1452     cache_prefetch ((__m128i*)ps);
1453     cache_prefetch ((__m128i*)pd);
1454     cache_prefetch ((__m128i*)pm);
1455
1456     while (w >= 4)
1457     {
1458         __m128i s;
1459
1460         /* fill cache line with next memory */
1461         cache_prefetch_next ((__m128i*)ps);
1462         cache_prefetch_next ((__m128i*)pd);
1463         cache_prefetch_next ((__m128i*)pm);
1464
1465         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1466
1467         save_128_aligned (
1468             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1469
1470         pd += 4;
1471         ps += 4;
1472         if (pm)
1473             pm += 4;
1474         w -= 4;
1475     }
1476
1477     while (w--)
1478     {
1479         s = combine1 (ps, pm);
1480         d = *pd;
1481
1482         ps++;
1483         *pd++ = _mm_cvtsi64_si32 (
1484             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1485         if (pm)
1486             pm++;
1487     }
1488 }
1489
1490 static force_inline uint32_t
1491 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1492                                     uint32_t dst)
1493 {
1494     __m64 ms = unpack_32_1x64 (src);
1495     __m64 md = unpack_32_1x64 (dst);
1496     uint32_t sa = src >> 24;
1497     uint32_t da = ~dst >> 24;
1498
1499     if (sa > da)
1500     {
1501         ms = pix_multiply_1x64 (
1502             ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1503     }
1504
1505     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1506 }
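/* SATURATE: da is the headroom left in the destination (255 - dst alpha).
 * If the source alpha exceeds it, the whole source pixel is scaled by
 * da / sa before the add, so the result stays a consistent premultiplied
 * pixel instead of each channel clamping independently.
 */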
1507
1508 static force_inline void
1509 core_combine_saturate_u_sse2 (uint32_t *      pd,
1510                               const uint32_t *ps,
1511                               const uint32_t *pm,
1512                               int             w)
1513 {
1514     uint32_t s, d;
1515
1516     uint32_t pack_cmp;
1517     __m128i xmm_src, xmm_dst;
1518
1519     /* issue prefetch hints to optimize cache loads */
1520     cache_prefetch ((__m128i*)ps);
1521     cache_prefetch ((__m128i*)pd);
1522     cache_prefetch ((__m128i*)pm);
1523
1524     while (w && (unsigned long)pd & 15)
1525     {
1526         s = combine1 (ps, pm);
1527         d = *pd;
1528
1529         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1530         w--;
1531         ps++;
1532         if (pm)
1533             pm++;
1534     }
1535
1536     /* issue prefetch hints to optimize cache loads */
1537     cache_prefetch ((__m128i*)ps);
1538     cache_prefetch ((__m128i*)pd);
1539     cache_prefetch ((__m128i*)pm);
1540
1541     while (w >= 4)
1542     {
1543         /* fill cache line with next memory */
1544         cache_prefetch_next ((__m128i*)ps);
1545         cache_prefetch_next ((__m128i*)pd);
1546         cache_prefetch_next ((__m128i*)pm);
1547
1548         xmm_dst = load_128_aligned  ((__m128i*)pd);
1549         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1550
1551         pack_cmp = _mm_movemask_epi8 (
1552             _mm_cmpgt_epi32 (
1553                 _mm_srli_epi32 (xmm_src, 24),
1554                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1555
1556         /* if any src alpha is greater than the respective ~dst alpha */
1557         if (pack_cmp)
1558         {
1559             s = combine1 (ps++, pm);
1560             d = *pd;
1561             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1562             if (pm)
1563                 pm++;
1564
1565             s = combine1 (ps++, pm);
1566             d = *pd;
1567             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1568             if (pm)
1569                 pm++;
1570
1571             s = combine1 (ps++, pm);
1572             d = *pd;
1573             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1574             if (pm)
1575                 pm++;
1576
1577             s = combine1 (ps++, pm);
1578             d = *pd;
1579             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1580             if (pm)
1581                 pm++;
1582         }
1583         else
1584         {
1585             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1586
1587             pd += 4;
1588             ps += 4;
1589             if (pm)
1590                 pm += 4;
1591         }
1592
1593         w -= 4;
1594     }
1595
1596     while (w--)
1597     {
1598         s = combine1 (ps, pm);
1599         d = *pd;
1600
1601         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1602         ps++;
1603         if (pm)
1604             pm++;
1605     }
1606 }
1607
1608 static force_inline void
1609 core_combine_src_ca_sse2 (uint32_t*       pd,
1610                           const uint32_t* ps,
1611                           const uint32_t *pm,
1612                           int             w)
1613 {
1614     uint32_t s, m;
1615
1616     __m128i xmm_src_lo, xmm_src_hi;
1617     __m128i xmm_mask_lo, xmm_mask_hi;
1618     __m128i xmm_dst_lo, xmm_dst_hi;
1619
1620     /* issue prefetch hints to optimize cache loads */
1621     cache_prefetch ((__m128i*)ps);
1622     cache_prefetch ((__m128i*)pd);
1623     cache_prefetch ((__m128i*)pm);
1624
1625     while (w && (unsigned long)pd & 15)
1626     {
1627         s = *ps++;
1628         m = *pm++;
1629         *pd++ = pack_1x64_32 (
1630             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1631         w--;
1632     }
1633
1634     /* issue prefetch hints to optimize cache loads */
1635     cache_prefetch ((__m128i*)ps);
1636     cache_prefetch ((__m128i*)pd);
1637     cache_prefetch ((__m128i*)pm);
1638
1639     while (w >= 4)
1640     {
1641         /* fill cache line with next memory */
1642         cache_prefetch_next ((__m128i*)ps);
1643         cache_prefetch_next ((__m128i*)pd);
1644         cache_prefetch_next ((__m128i*)pm);
1645
1646         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1647         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1648
1649         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1650         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1651
1652         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1653                             &xmm_mask_lo, &xmm_mask_hi,
1654                             &xmm_dst_lo, &xmm_dst_hi);
1655
1656         save_128_aligned (
1657             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1658
1659         ps += 4;
1660         pd += 4;
1661         pm += 4;
1662         w -= 4;
1663     }
1664
1665     while (w)
1666     {
1667         s = *ps++;
1668         m = *pm++;
1669         *pd++ = pack_1x64_32 (
1670             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1671         w--;
1672     }
1673 }
1674
1675 static force_inline uint32_t
1676 core_combine_over_ca_pixel_sse2 (uint32_t src,
1677                                  uint32_t mask,
1678                                  uint32_t dst)
1679 {
1680     __m64 s = unpack_32_1x64 (src);
1681     __m64 expAlpha = expand_alpha_1x64 (s);
1682     __m64 unpk_mask = unpack_32_1x64 (mask);
1683     __m64 unpk_dst  = unpack_32_1x64 (dst);
1684
1685     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1686 }
1687
1688 static force_inline void
1689 core_combine_over_ca_sse2 (uint32_t*       pd,
1690                            const uint32_t* ps,
1691                            const uint32_t *pm,
1692                            int             w)
1693 {
1694     uint32_t s, m, d;
1695
1696     __m128i xmm_alpha_lo, xmm_alpha_hi;
1697     __m128i xmm_src_lo, xmm_src_hi;
1698     __m128i xmm_dst_lo, xmm_dst_hi;
1699     __m128i xmm_mask_lo, xmm_mask_hi;
1700
1701     /* issue prefetch hints to optimize cache loads */
1702     cache_prefetch ((__m128i*)ps);
1703     cache_prefetch ((__m128i*)pd);
1704     cache_prefetch ((__m128i*)pm);
1705
1706     while (w && (unsigned long)pd & 15)
1707     {
1708         s = *ps++;
1709         m = *pm++;
1710         d = *pd;
1711
1712         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1713         w--;
1714     }
1715
1716     /* issue prefetch hints to optimize cache loads */
1717     cache_prefetch ((__m128i*)ps);
1718     cache_prefetch ((__m128i*)pd);
1719     cache_prefetch ((__m128i*)pm);
1720
1721     while (w >= 4)
1722     {
1723         /* fill cache line with next memory */
1724         cache_prefetch_next ((__m128i*)ps);
1725         cache_prefetch_next ((__m128i*)pd);
1726         cache_prefetch_next ((__m128i*)pm);
1727
1728         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1729         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1730         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1731
1732         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1733         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1734         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1735
1736         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1737                             &xmm_alpha_lo, &xmm_alpha_hi);
1738
1739         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1740                        &xmm_alpha_lo, &xmm_alpha_hi,
1741                        &xmm_mask_lo, &xmm_mask_hi,
1742                        &xmm_dst_lo, &xmm_dst_hi);
1743
1744         save_128_aligned (
1745             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1746
1747         ps += 4;
1748         pd += 4;
1749         pm += 4;
1750         w -= 4;
1751     }
1752
1753     while (w)
1754     {
1755         s = *ps++;
1756         m = *pm++;
1757         d = *pd;
1758
1759         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1760         w--;
1761     }
1762 }
1763
1764 static force_inline uint32_t
1765 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1766                                          uint32_t mask,
1767                                          uint32_t dst)
1768 {
1769     __m64 d = unpack_32_1x64 (dst);
1770
1771     return pack_1x64_32 (
1772         over_1x64 (d, expand_alpha_1x64 (d),
1773                    pix_multiply_1x64 (unpack_32_1x64 (src),
1774                                       unpack_32_1x64 (mask))));
1775 }
1776
1777 static force_inline void
1778 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1779                                    const uint32_t* ps,
1780                                    const uint32_t *pm,
1781                                    int             w)
1782 {
1783     uint32_t s, m, d;
1784
1785     __m128i xmm_alpha_lo, xmm_alpha_hi;
1786     __m128i xmm_src_lo, xmm_src_hi;
1787     __m128i xmm_dst_lo, xmm_dst_hi;
1788     __m128i xmm_mask_lo, xmm_mask_hi;
1789
1790     /* issue prefetch hints to optimize cache loads */
1791     cache_prefetch ((__m128i*)ps);
1792     cache_prefetch ((__m128i*)pd);
1793     cache_prefetch ((__m128i*)pm);
1794
1795     while (w && (unsigned long)pd & 15)
1796     {
1797         s = *ps++;
1798         m = *pm++;
1799         d = *pd;
1800
1801         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1802         w--;
1803     }
1804
1805     /* issue prefetch hints to optimize cache loads */
1806     cache_prefetch ((__m128i*)ps);
1807     cache_prefetch ((__m128i*)pd);
1808     cache_prefetch ((__m128i*)pm);
1809
1810     while (w >= 4)
1811     {
1812         /* fill cache line with next memory */
1813         cache_prefetch_next ((__m128i*)ps);
1814         cache_prefetch_next ((__m128i*)pd);
1815         cache_prefetch_next ((__m128i*)pm);
1816
1817         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1818         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1819         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1820
1821         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1822         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1823         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1824
1825         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1826                             &xmm_alpha_lo, &xmm_alpha_hi);
1827         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1828                             &xmm_mask_lo, &xmm_mask_hi,
1829                             &xmm_mask_lo, &xmm_mask_hi);
1830
1831         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1832                     &xmm_alpha_lo, &xmm_alpha_hi,
1833                     &xmm_mask_lo, &xmm_mask_hi);
1834
1835         save_128_aligned (
1836             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1837
1838         ps += 4;
1839         pd += 4;
1840         pm += 4;
1841         w -= 4;
1842     }
1843
1844     while (w)
1845     {
1846         s = *ps++;
1847         m = *pm++;
1848         d = *pd;
1849
1850         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1851         w--;
1852     }
1853 }
1854
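/* IN with component alpha (per channel):
 *
 *    dest = (src * mask) * dest.alpha
 */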
1855 static force_inline void
1856 core_combine_in_ca_sse2 (uint32_t *      pd,
1857                          const uint32_t *ps,
1858                          const uint32_t *pm,
1859                          int             w)
1860 {
1861     uint32_t s, m, d;
1862
1863     __m128i xmm_alpha_lo, xmm_alpha_hi;
1864     __m128i xmm_src_lo, xmm_src_hi;
1865     __m128i xmm_dst_lo, xmm_dst_hi;
1866     __m128i xmm_mask_lo, xmm_mask_hi;
1867
1868     /* call prefetch hint to optimize cache load*/
1869     cache_prefetch ((__m128i*)ps);
1870     cache_prefetch ((__m128i*)pd);
1871     cache_prefetch ((__m128i*)pm);
1872
1873     while (w && (unsigned long)pd & 15)
1874     {
1875         s = *ps++;
1876         m = *pm++;
1877         d = *pd;
1878
1879         *pd++ = pack_1x64_32 (
1880             pix_multiply_1x64 (
1881                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1882                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1883
1884         w--;
1885     }
1886
1887     /* call prefetch hint to optimize cache load*/
1888     cache_prefetch ((__m128i*)ps);
1889     cache_prefetch ((__m128i*)pd);
1890     cache_prefetch ((__m128i*)pm);
1891
1892     while (w >= 4)
1893     {
1894         /* fill cache line with next memory */
1895         cache_prefetch_next ((__m128i*)ps);
1896         cache_prefetch_next ((__m128i*)pd);
1897         cache_prefetch_next ((__m128i*)pm);
1898
1899         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1900         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1901         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1902
1903         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1904         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1905         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1906
1907         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1908                             &xmm_alpha_lo, &xmm_alpha_hi);
1909
1910         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1911                             &xmm_mask_lo, &xmm_mask_hi,
1912                             &xmm_dst_lo, &xmm_dst_hi);
1913
1914         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1915                             &xmm_alpha_lo, &xmm_alpha_hi,
1916                             &xmm_dst_lo, &xmm_dst_hi);
1917
1918         save_128_aligned (
1919             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1920
1921         ps += 4;
1922         pd += 4;
1923         pm += 4;
1924         w -= 4;
1925     }
1926
1927     while (w)
1928     {
1929         s = *ps++;
1930         m = *pm++;
1931         d = *pd;
1932
1933         *pd++ = pack_1x64_32 (
1934             pix_multiply_1x64 (
1935                 pix_multiply_1x64 (
1936                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1937                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1938
1939         w--;
1940     }
1941 }
1942
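/* IN_REVERSE with component alpha (per channel):
 *
 *    dest = dest * (mask * src.alpha)
 */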
1943 static force_inline void
1944 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1945                                  const uint32_t *ps,
1946                                  const uint32_t *pm,
1947                                  int             w)
1948 {
1949     uint32_t s, m, d;
1950
1951     __m128i xmm_alpha_lo, xmm_alpha_hi;
1952     __m128i xmm_src_lo, xmm_src_hi;
1953     __m128i xmm_dst_lo, xmm_dst_hi;
1954     __m128i xmm_mask_lo, xmm_mask_hi;
1955
1956     /* call prefetch hint to optimize cache load*/
1957     cache_prefetch ((__m128i*)ps);
1958     cache_prefetch ((__m128i*)pd);
1959     cache_prefetch ((__m128i*)pm);
1960
1961     while (w && (unsigned long)pd & 15)
1962     {
1963         s = *ps++;
1964         m = *pm++;
1965         d = *pd;
1966
1967         *pd++ = pack_1x64_32 (
1968             pix_multiply_1x64 (
1969                 unpack_32_1x64 (d),
1970                 pix_multiply_1x64 (unpack_32_1x64 (m),
1971                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1972         w--;
1973     }
1974
1975     /* call prefetch hint to optimize cache load*/
1976     cache_prefetch ((__m128i*)ps);
1977     cache_prefetch ((__m128i*)pd);
1978     cache_prefetch ((__m128i*)pm);
1979
1980     while (w >= 4)
1981     {
1982         /* fill cache line with next memory */
1983         cache_prefetch_next ((__m128i*)ps);
1984         cache_prefetch_next ((__m128i*)pd);
1985         cache_prefetch_next ((__m128i*)pm);
1986
1987         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1988         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1989         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1990
1991         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1992         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1993         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1994
1995         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1996                             &xmm_alpha_lo, &xmm_alpha_hi);
1997         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1998                             &xmm_alpha_lo, &xmm_alpha_hi,
1999                             &xmm_alpha_lo, &xmm_alpha_hi);
2000
2001         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2002                             &xmm_alpha_lo, &xmm_alpha_hi,
2003                             &xmm_dst_lo, &xmm_dst_hi);
2004
2005         save_128_aligned (
2006             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2007
2008         ps += 4;
2009         pd += 4;
2010         pm += 4;
2011         w -= 4;
2012     }
2013
2014     while (w)
2015     {
2016         s = *ps++;
2017         m = *pm++;
2018         d = *pd;
2019
2020         *pd++ = pack_1x64_32 (
2021             pix_multiply_1x64 (
2022                 unpack_32_1x64 (d),
2023                 pix_multiply_1x64 (unpack_32_1x64 (m),
2024                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
2025         w--;
2026     }
2027 }
2028
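/* OUT with component alpha (per channel):
 *
 *    dest = (src * mask) * (1 - dest.alpha)
 */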
2029 static force_inline void
2030 core_combine_out_ca_sse2 (uint32_t *      pd,
2031                           const uint32_t *ps,
2032                           const uint32_t *pm,
2033                           int             w)
2034 {
2035     uint32_t s, m, d;
2036
2037     __m128i xmm_alpha_lo, xmm_alpha_hi;
2038     __m128i xmm_src_lo, xmm_src_hi;
2039     __m128i xmm_dst_lo, xmm_dst_hi;
2040     __m128i xmm_mask_lo, xmm_mask_hi;
2041
2042     /* call prefetch hint to optimize cache load*/
2043     cache_prefetch ((__m128i*)ps);
2044     cache_prefetch ((__m128i*)pd);
2045     cache_prefetch ((__m128i*)pm);
2046
2047     while (w && (unsigned long)pd & 15)
2048     {
2049         s = *ps++;
2050         m = *pm++;
2051         d = *pd;
2052
2053         *pd++ = pack_1x64_32 (
2054             pix_multiply_1x64 (
2055                 pix_multiply_1x64 (
2056                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2057                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2058         w--;
2059     }
2060
2061     /* call prefetch hint to optimize cache load*/
2062     cache_prefetch ((__m128i*)ps);
2063     cache_prefetch ((__m128i*)pd);
2064     cache_prefetch ((__m128i*)pm);
2065
2066     while (w >= 4)
2067     {
2068         /* fill cache line with next memory */
2069         cache_prefetch_next ((__m128i*)ps);
2070         cache_prefetch_next ((__m128i*)pd);
2071         cache_prefetch_next ((__m128i*)pm);
2072
2073         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2074         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2075         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2076
2077         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2078         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2079         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2080
2081         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2082                             &xmm_alpha_lo, &xmm_alpha_hi);
2083         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2084                       &xmm_alpha_lo, &xmm_alpha_hi);
2085
2086         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2087                             &xmm_mask_lo, &xmm_mask_hi,
2088                             &xmm_dst_lo, &xmm_dst_hi);
2089         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2090                             &xmm_alpha_lo, &xmm_alpha_hi,
2091                             &xmm_dst_lo, &xmm_dst_hi);
2092
2093         save_128_aligned (
2094             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2095
2096         ps += 4;
2097         pd += 4;
2098         pm += 4;
2099         w -= 4;
2100     }
2101
2102     while (w)
2103     {
2104         s = *ps++;
2105         m = *pm++;
2106         d = *pd;
2107
2108         *pd++ = pack_1x64_32 (
2109             pix_multiply_1x64 (
2110                 pix_multiply_1x64 (
2111                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2112                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2113
2114         w--;
2115     }
2116 }
2117
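/* OUT_REVERSE with component alpha (per channel):
 *
 *    dest = dest * (1 - mask * src.alpha)
 */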
2118 static force_inline void
2119 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
2120                                   const uint32_t *ps,
2121                                   const uint32_t *pm,
2122                                   int             w)
2123 {
2124     uint32_t s, m, d;
2125
2126     __m128i xmm_alpha_lo, xmm_alpha_hi;
2127     __m128i xmm_src_lo, xmm_src_hi;
2128     __m128i xmm_dst_lo, xmm_dst_hi;
2129     __m128i xmm_mask_lo, xmm_mask_hi;
2130
2131     /* call prefetch hint to optimize cache load*/
2132     cache_prefetch ((__m128i*)ps);
2133     cache_prefetch ((__m128i*)pd);
2134     cache_prefetch ((__m128i*)pm);
2135
2136     while (w && (unsigned long)pd & 15)
2137     {
2138         s = *ps++;
2139         m = *pm++;
2140         d = *pd;
2141
2142         *pd++ = pack_1x64_32 (
2143             pix_multiply_1x64 (
2144                 unpack_32_1x64 (d),
2145                 negate_1x64 (pix_multiply_1x64 (
2146                                  unpack_32_1x64 (m),
2147                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2148         w--;
2149     }
2150
2151     /* call prefetch hint to optimize cache load*/
2152     cache_prefetch ((__m128i*)ps);
2153     cache_prefetch ((__m128i*)pd);
2154     cache_prefetch ((__m128i*)pm);
2155
2156     while (w >= 4)
2157     {
2158         /* fill cache line with next memory */
2159         cache_prefetch_next ((__m128i*)ps);
2160         cache_prefetch_next ((__m128i*)pd);
2161         cache_prefetch_next ((__m128i*)pm);
2162
2163         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2164         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2165         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2166
2167         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2168         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2169         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2170
2171         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2172                             &xmm_alpha_lo, &xmm_alpha_hi);
2173
2174         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2175                             &xmm_alpha_lo, &xmm_alpha_hi,
2176                             &xmm_mask_lo, &xmm_mask_hi);
2177
2178         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2179                       &xmm_mask_lo, &xmm_mask_hi);
2180
2181         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2182                             &xmm_mask_lo, &xmm_mask_hi,
2183                             &xmm_dst_lo, &xmm_dst_hi);
2184
2185         save_128_aligned (
2186             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2187
2188         ps += 4;
2189         pd += 4;
2190         pm += 4;
2191         w -= 4;
2192     }
2193
2194     while (w)
2195     {
2196         s = *ps++;
2197         m = *pm++;
2198         d = *pd;
2199
2200         *pd++ = pack_1x64_32 (
2201             pix_multiply_1x64 (
2202                 unpack_32_1x64 (d),
2203                 negate_1x64 (pix_multiply_1x64 (
2204                                  unpack_32_1x64 (m),
2205                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2206         w--;
2207     }
2208 }
2209
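/* ATOP with component alpha (per channel):
 *
 *    dest = (src * mask) * dest.alpha + dest * (1 - mask * src.alpha)
 */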
2210 static force_inline uint32_t
2211 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2212                                  uint32_t mask,
2213                                  uint32_t dst)
2214 {
2215     __m64 m = unpack_32_1x64 (mask);
2216     __m64 s = unpack_32_1x64 (src);
2217     __m64 d = unpack_32_1x64 (dst);
2218     __m64 sa = expand_alpha_1x64 (s);
2219     __m64 da = expand_alpha_1x64 (d);
2220
2221     s = pix_multiply_1x64 (s, m);
2222     m = negate_1x64 (pix_multiply_1x64 (m, sa));
2223
2224     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2225 }
2226
2227 static force_inline void
2228 core_combine_atop_ca_sse2 (uint32_t *      pd,
2229                            const uint32_t *ps,
2230                            const uint32_t *pm,
2231                            int             w)
2232 {
2233     uint32_t s, m, d;
2234
2235     __m128i xmm_src_lo, xmm_src_hi;
2236     __m128i xmm_dst_lo, xmm_dst_hi;
2237     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2238     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2239     __m128i xmm_mask_lo, xmm_mask_hi;
2240
2241     /* call prefetch hint to optimize cache load*/
2242     cache_prefetch ((__m128i*)ps);
2243     cache_prefetch ((__m128i*)pd);
2244     cache_prefetch ((__m128i*)pm);
2245
2246     while (w && (unsigned long)pd & 15)
2247     {
2248         s = *ps++;
2249         m = *pm++;
2250         d = *pd;
2251
2252         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2253         w--;
2254     }
2255
2256     /* call prefetch hint to optimize cache load*/
2257     cache_prefetch ((__m128i*)ps);
2258     cache_prefetch ((__m128i*)pd);
2259     cache_prefetch ((__m128i*)pm);
2260
2261     while (w >= 4)
2262     {
2263         /* fill cache line with next memory */
2264         cache_prefetch_next ((__m128i*)ps);
2265         cache_prefetch_next ((__m128i*)pd);
2266         cache_prefetch_next ((__m128i*)pm);
2267
2268         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2269         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2270         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2271
2272         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2273         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2274         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2275
2276         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2277                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2278         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2279                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2280
2281         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2282                             &xmm_mask_lo, &xmm_mask_hi,
2283                             &xmm_src_lo, &xmm_src_hi);
2284         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2285                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2286                             &xmm_mask_lo, &xmm_mask_hi);
2287
2288         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2289
2290         pix_add_multiply_2x128 (
2291             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2292             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2293             &xmm_dst_lo, &xmm_dst_hi);
2294
2295         save_128_aligned (
2296             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2297
2298         ps += 4;
2299         pd += 4;
2300         pm += 4;
2301         w -= 4;
2302     }
2303
2304     while (w)
2305     {
2306         s = *ps++;
2307         m = *pm++;
2308         d = *pd;
2309
2310         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2311         w--;
2312     }
2313 }
2314
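/* ATOP_REVERSE with component alpha (per channel):
 *
 *    dest = (src * mask) * (1 - dest.alpha) + dest * (mask * src.alpha)
 */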
2315 static force_inline uint32_t
2316 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2317                                          uint32_t mask,
2318                                          uint32_t dst)
2319 {
2320     __m64 m = unpack_32_1x64 (mask);
2321     __m64 s = unpack_32_1x64 (src);
2322     __m64 d = unpack_32_1x64 (dst);
2323
2324     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2325     __m64 sa = expand_alpha_1x64 (s);
2326
2327     s = pix_multiply_1x64 (s, m);
2328     m = pix_multiply_1x64 (m, sa);
2329
2330     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2331 }
2332
2333 static force_inline void
2334 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2335                                    const uint32_t *ps,
2336                                    const uint32_t *pm,
2337                                    int             w)
2338 {
2339     uint32_t s, m, d;
2340
2341     __m128i xmm_src_lo, xmm_src_hi;
2342     __m128i xmm_dst_lo, xmm_dst_hi;
2343     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2344     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2345     __m128i xmm_mask_lo, xmm_mask_hi;
2346
2347     /* call prefetch hint to optimize cache load*/
2348     cache_prefetch ((__m128i*)ps);
2349     cache_prefetch ((__m128i*)pd);
2350     cache_prefetch ((__m128i*)pm);
2351
2352     while (w && (unsigned long)pd & 15)
2353     {
2354         s = *ps++;
2355         m = *pm++;
2356         d = *pd;
2357
2358         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2359         w--;
2360     }
2361
2362     /* call prefetch hint to optimize cache load*/
2363     cache_prefetch ((__m128i*)ps);
2364     cache_prefetch ((__m128i*)pd);
2365     cache_prefetch ((__m128i*)pm);
2366
2367     while (w >= 4)
2368     {
2369         /* fill cache line with next memory */
2370         cache_prefetch_next ((__m128i*)ps);
2371         cache_prefetch_next ((__m128i*)pd);
2372         cache_prefetch_next ((__m128i*)pm);
2373
2374         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2375         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2376         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2377
2378         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2379         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2380         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2381
2382         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2383                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2384         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2385                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2386
2387         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2388                             &xmm_mask_lo, &xmm_mask_hi,
2389                             &xmm_src_lo, &xmm_src_hi);
2390         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2391                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2392                             &xmm_mask_lo, &xmm_mask_hi);
2393
2394         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2395                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2396
2397         pix_add_multiply_2x128 (
2398             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2399             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2400             &xmm_dst_lo, &xmm_dst_hi);
2401
2402         save_128_aligned (
2403             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2404
2405         ps += 4;
2406         pd += 4;
2407         pm += 4;
2408         w -= 4;
2409     }
2410
2411     while (w)
2412     {
2413         s = *ps++;
2414         m = *pm++;
2415         d = *pd;
2416
2417         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2418         w--;
2419     }
2420 }
2421
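/* XOR with component alpha (per channel):
 *
 *    dest = (src * mask) * (1 - dest.alpha) + dest * (1 - mask * src.alpha)
 */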
2422 static force_inline uint32_t
2423 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2424                                 uint32_t mask,
2425                                 uint32_t dst)
2426 {
2427     __m64 a = unpack_32_1x64 (mask);
2428     __m64 s = unpack_32_1x64 (src);
2429     __m64 d = unpack_32_1x64 (dst);
2430
2431     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2432                                        a, expand_alpha_1x64 (s)));
2433     __m64 dest      = pix_multiply_1x64 (s, a);
2434     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2435
2436     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2437                                                 &alpha_dst,
2438                                                 &dest,
2439                                                 &alpha_src));
2440 }
2441
2442 static force_inline void
2443 core_combine_xor_ca_sse2 (uint32_t *      pd,
2444                           const uint32_t *ps,
2445                           const uint32_t *pm,
2446                           int             w)
2447 {
2448     uint32_t s, m, d;
2449
2450     __m128i xmm_src_lo, xmm_src_hi;
2451     __m128i xmm_dst_lo, xmm_dst_hi;
2452     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2453     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2454     __m128i xmm_mask_lo, xmm_mask_hi;
2455
2456     /* call prefetch hint to optimize cache load*/
2457     cache_prefetch ((__m128i*)ps);
2458     cache_prefetch ((__m128i*)pd);
2459     cache_prefetch ((__m128i*)pm);
2460
2461     while (w && (unsigned long)pd & 15)
2462     {
2463         s = *ps++;
2464         m = *pm++;
2465         d = *pd;
2466
2467         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2468         w--;
2469     }
2470
2471     /* call prefetch hint to optimize cache load*/
2472     cache_prefetch ((__m128i*)ps);
2473     cache_prefetch ((__m128i*)pd);
2474     cache_prefetch ((__m128i*)pm);
2475
2476     while (w >= 4)
2477     {
2478         /* fill cache line with next memory */
2479         cache_prefetch_next ((__m128i*)ps);
2480         cache_prefetch_next ((__m128i*)pd);
2481         cache_prefetch_next ((__m128i*)pm);
2482
2483         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2484         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2485         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2486
2487         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2488         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2489         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2490
2491         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2492                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2493         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2494                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2495
2496         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2497                             &xmm_mask_lo, &xmm_mask_hi,
2498                             &xmm_src_lo, &xmm_src_hi);
2499         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2500                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2501                             &xmm_mask_lo, &xmm_mask_hi);
2502
2503         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2504                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2505         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2506                       &xmm_mask_lo, &xmm_mask_hi);
2507
2508         pix_add_multiply_2x128 (
2509             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2510             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2511             &xmm_dst_lo, &xmm_dst_hi);
2512
2513         save_128_aligned (
2514             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2515
2516         ps += 4;
2517         pd += 4;
2518         pm += 4;
2519         w -= 4;
2520     }
2521
2522     while (w)
2523     {
2524         s = *ps++;
2525         m = *pm++;
2526         d = *pd;
2527
2528         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2529         w--;
2530     }
2531 }
2532
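/* ADD with component alpha (per channel):
 *
 *    dest = saturate (src * mask + dest)
 */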
2533 static force_inline void
2534 core_combine_add_ca_sse2 (uint32_t *      pd,
2535                           const uint32_t *ps,
2536                           const uint32_t *pm,
2537                           int             w)
2538 {
2539     uint32_t s, m, d;
2540
2541     __m128i xmm_src_lo, xmm_src_hi;
2542     __m128i xmm_dst_lo, xmm_dst_hi;
2543     __m128i xmm_mask_lo, xmm_mask_hi;
2544
2545     /* call prefetch hint to optimize cache load*/
2546     cache_prefetch ((__m128i*)ps);
2547     cache_prefetch ((__m128i*)pd);
2548     cache_prefetch ((__m128i*)pm);
2549
2550     while (w && (unsigned long)pd & 15)
2551     {
2552         s = *ps++;
2553         m = *pm++;
2554         d = *pd;
2555
2556         *pd++ = pack_1x64_32 (
2557             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2558                                              unpack_32_1x64 (m)),
2559                           unpack_32_1x64 (d)));
2560         w--;
2561     }
2562
2563     /* call prefetch hint to optimize cache load*/
2564     cache_prefetch ((__m128i*)ps);
2565     cache_prefetch ((__m128i*)pd);
2566     cache_prefetch ((__m128i*)pm);
2567
2568     while (w >= 4)
2569     {
2570         /* fill cache line with next memory */
2571         cache_prefetch_next ((__m128i*)ps);
2572         cache_prefetch_next ((__m128i*)pd);
2573         cache_prefetch_next ((__m128i*)pm);
2574
2575         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2576         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2577         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2578
2579         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2580         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2581         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2582
2583         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2584                             &xmm_mask_lo, &xmm_mask_hi,
2585                             &xmm_src_lo, &xmm_src_hi);
2586
2587         save_128_aligned (
2588             (__m128i*)pd, pack_2x128_128 (
2589                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2590                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2591
2592         ps += 4;
2593         pd += 4;
2594         pm += 4;
2595         w -= 4;
2596     }
2597
2598     while (w)
2599     {
2600         s = *ps++;
2601         m = *pm++;
2602         d = *pd;
2603
2604         *pd++ = pack_1x64_32 (
2605             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2606                                              unpack_32_1x64 (m)),
2607                           unpack_32_1x64 (d)));
2608         w--;
2609     }
2610 }
2611
2612 /* ---------------------------------------------------
2613  * fb_compose_setup_SSE2
2614  */
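/* Helpers that build replicated vector constants: a 16-bit value
 * broadcast into every lane, or a pair of 32-bit words repeated
 * across the register.
 */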
2615 static force_inline __m64
2616 create_mask_16_64 (uint16_t mask)
2617 {
2618     return _mm_set1_pi16 (mask);
2619 }
2620
2621 static force_inline __m128i
2622 create_mask_16_128 (uint16_t mask)
2623 {
2624     return _mm_set1_epi16 (mask);
2625 }
2626
2627 static force_inline __m64
2628 create_mask_2x32_64 (uint32_t mask0,
2629                      uint32_t mask1)
2630 {
2631     return _mm_set_pi32 (mask0, mask1);
2632 }
2633
2634 /* Work around a code generation bug in Sun Studio 12. */
2635 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2636 # define create_mask_2x32_128(mask0, mask1) \
2637         (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2638 #else
2639 static force_inline __m128i
2640 create_mask_2x32_128 (uint32_t mask0,
2641                       uint32_t mask1)
2642 {
2643     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2644 }
2645 #endif
2646
2647 /* SSE2 code patch for fbcompose.c */
2648
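/* Each of these entry points simply forwards to the corresponding
 * force_inline core routine above and then issues _mm_empty () (EMMS)
 * so the MMX state is cleared before returning to code that may use
 * x87 floating point.
 */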
2649 static void
2650 sse2_combine_over_u (pixman_implementation_t *imp,
2651                      pixman_op_t              op,
2652                      uint32_t *               dst,
2653                      const uint32_t *         src,
2654                      const uint32_t *         mask,
2655                      int                      width)
2656 {
2657     core_combine_over_u_sse2 (dst, src, mask, width);
2658     _mm_empty ();
2659 }
2660
2661 static void
2662 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2663                              pixman_op_t              op,
2664                              uint32_t *               dst,
2665                              const uint32_t *         src,
2666                              const uint32_t *         mask,
2667                              int                      width)
2668 {
2669     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2670     _mm_empty ();
2671 }
2672
2673 static void
2674 sse2_combine_in_u (pixman_implementation_t *imp,
2675                    pixman_op_t              op,
2676                    uint32_t *               dst,
2677                    const uint32_t *         src,
2678                    const uint32_t *         mask,
2679                    int                      width)
2680 {
2681     core_combine_in_u_sse2 (dst, src, mask, width);
2682     _mm_empty ();
2683 }
2684
2685 static void
2686 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2687                            pixman_op_t              op,
2688                            uint32_t *               dst,
2689                            const uint32_t *         src,
2690                            const uint32_t *         mask,
2691                            int                      width)
2692 {
2693     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2694     _mm_empty ();
2695 }
2696
2697 static void
2698 sse2_combine_out_u (pixman_implementation_t *imp,
2699                     pixman_op_t              op,
2700                     uint32_t *               dst,
2701                     const uint32_t *         src,
2702                     const uint32_t *         mask,
2703                     int                      width)
2704 {
2705     core_combine_out_u_sse2 (dst, src, mask, width);
2706     _mm_empty ();
2707 }
2708
2709 static void
2710 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2711                             pixman_op_t              op,
2712                             uint32_t *               dst,
2713                             const uint32_t *         src,
2714                             const uint32_t *         mask,
2715                             int                      width)
2716 {
2717     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2718     _mm_empty ();
2719 }
2720
2721 static void
2722 sse2_combine_atop_u (pixman_implementation_t *imp,
2723                      pixman_op_t              op,
2724                      uint32_t *               dst,
2725                      const uint32_t *         src,
2726                      const uint32_t *         mask,
2727                      int                      width)
2728 {
2729     core_combine_atop_u_sse2 (dst, src, mask, width);
2730     _mm_empty ();
2731 }
2732
2733 static void
2734 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2735                              pixman_op_t              op,
2736                              uint32_t *               dst,
2737                              const uint32_t *         src,
2738                              const uint32_t *         mask,
2739                              int                      width)
2740 {
2741     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2742     _mm_empty ();
2743 }
2744
2745 static void
2746 sse2_combine_xor_u (pixman_implementation_t *imp,
2747                     pixman_op_t              op,
2748                     uint32_t *               dst,
2749                     const uint32_t *         src,
2750                     const uint32_t *         mask,
2751                     int                      width)
2752 {
2753     core_combine_xor_u_sse2 (dst, src, mask, width);
2754     _mm_empty ();
2755 }
2756
2757 static void
2758 sse2_combine_add_u (pixman_implementation_t *imp,
2759                     pixman_op_t              op,
2760                     uint32_t *               dst,
2761                     const uint32_t *         src,
2762                     const uint32_t *         mask,
2763                     int                      width)
2764 {
2765     core_combine_add_u_sse2 (dst, src, mask, width);
2766     _mm_empty ();
2767 }
2768
2769 static void
2770 sse2_combine_saturate_u (pixman_implementation_t *imp,
2771                          pixman_op_t              op,
2772                          uint32_t *               dst,
2773                          const uint32_t *         src,
2774                          const uint32_t *         mask,
2775                          int                      width)
2776 {
2777     core_combine_saturate_u_sse2 (dst, src, mask, width);
2778     _mm_empty ();
2779 }
2780
2781 static void
2782 sse2_combine_src_ca (pixman_implementation_t *imp,
2783                      pixman_op_t              op,
2784                      uint32_t *               dst,
2785                      const uint32_t *         src,
2786                      const uint32_t *         mask,
2787                      int                      width)
2788 {
2789     core_combine_src_ca_sse2 (dst, src, mask, width);
2790     _mm_empty ();
2791 }
2792
2793 static void
2794 sse2_combine_over_ca (pixman_implementation_t *imp,
2795                       pixman_op_t              op,
2796                       uint32_t *               dst,
2797                       const uint32_t *         src,
2798                       const uint32_t *         mask,
2799                       int                      width)
2800 {
2801     core_combine_over_ca_sse2 (dst, src, mask, width);
2802     _mm_empty ();
2803 }
2804
2805 static void
2806 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2807                               pixman_op_t              op,
2808                               uint32_t *               dst,
2809                               const uint32_t *         src,
2810                               const uint32_t *         mask,
2811                               int                      width)
2812 {
2813     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2814     _mm_empty ();
2815 }
2816
2817 static void
2818 sse2_combine_in_ca (pixman_implementation_t *imp,
2819                     pixman_op_t              op,
2820                     uint32_t *               dst,
2821                     const uint32_t *         src,
2822                     const uint32_t *         mask,
2823                     int                      width)
2824 {
2825     core_combine_in_ca_sse2 (dst, src, mask, width);
2826     _mm_empty ();
2827 }
2828
2829 static void
2830 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2831                             pixman_op_t              op,
2832                             uint32_t *               dst,
2833                             const uint32_t *         src,
2834                             const uint32_t *         mask,
2835                             int                      width)
2836 {
2837     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2838     _mm_empty ();
2839 }
2840
2841 static void
2842 sse2_combine_out_ca (pixman_implementation_t *imp,
2843                      pixman_op_t              op,
2844                      uint32_t *               dst,
2845                      const uint32_t *         src,
2846                      const uint32_t *         mask,
2847                      int                      width)
2848 {
2849     core_combine_out_ca_sse2 (dst, src, mask, width);
2850     _mm_empty ();
2851 }
2852
2853 static void
2854 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2855                              pixman_op_t              op,
2856                              uint32_t *               dst,
2857                              const uint32_t *         src,
2858                              const uint32_t *         mask,
2859                              int                      width)
2860 {
2861     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2862     _mm_empty ();
2863 }
2864
2865 static void
2866 sse2_combine_atop_ca (pixman_implementation_t *imp,
2867                       pixman_op_t              op,
2868                       uint32_t *               dst,
2869                       const uint32_t *         src,
2870                       const uint32_t *         mask,
2871                       int                      width)
2872 {
2873     core_combine_atop_ca_sse2 (dst, src, mask, width);
2874     _mm_empty ();
2875 }
2876
2877 static void
2878 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2879                               pixman_op_t              op,
2880                               uint32_t *               dst,
2881                               const uint32_t *         src,
2882                               const uint32_t *         mask,
2883                               int                      width)
2884 {
2885     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2886     _mm_empty ();
2887 }
2888
2889 static void
2890 sse2_combine_xor_ca (pixman_implementation_t *imp,
2891                      pixman_op_t              op,
2892                      uint32_t *               dst,
2893                      const uint32_t *         src,
2894                      const uint32_t *         mask,
2895                      int                      width)
2896 {
2897     core_combine_xor_ca_sse2 (dst, src, mask, width);
2898     _mm_empty ();
2899 }
2900
2901 static void
2902 sse2_combine_add_ca (pixman_implementation_t *imp,
2903                      pixman_op_t              op,
2904                      uint32_t *               dst,
2905                      const uint32_t *         src,
2906                      const uint32_t *         mask,
2907                      int                      width)
2908 {
2909     core_combine_add_ca_sse2 (dst, src, mask, width);
2910     _mm_empty ();
2911 }
2912
2913 /* -------------------------------------------------------------------
2914  * composite_over_n_8888
2915  */
2916
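/* Solid source OVER an 8888 destination.  The source pixel and its
 * expanded alpha are computed once; each scanline is then handled by
 * a scalar head loop until dst is 16-byte aligned, a 4-pixel SSE2
 * loop using aligned loads/stores, and a scalar tail loop.
 */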
2917 static void
2918 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2919                             pixman_op_t              op,
2920                             pixman_image_t *         src_image,
2921                             pixman_image_t *         mask_image,
2922                             pixman_image_t *         dst_image,
2923                             int32_t                  src_x,
2924                             int32_t                  src_y,
2925                             int32_t                  mask_x,
2926                             int32_t                  mask_y,
2927                             int32_t                  dest_x,
2928                             int32_t                  dest_y,
2929                             int32_t                  width,
2930                             int32_t                  height)
2931 {
2932     uint32_t src;
2933     uint32_t    *dst_line, *dst, d;
2934     uint16_t w;
2935     int dst_stride;
2936     __m128i xmm_src, xmm_alpha;
2937     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2938
2939     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2940
2941     if (src == 0)
2942         return;
2943
2944     PIXMAN_IMAGE_GET_LINE (
2945         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2946
2947     xmm_src = expand_pixel_32_1x128 (src);
2948     xmm_alpha = expand_alpha_1x128 (xmm_src);
2949
2950     while (height--)
2951     {
2952         dst = dst_line;
2953
2954         /* call prefetch hint to optimize cache load*/
2955         cache_prefetch ((__m128i*)dst);
2956
2957         dst_line += dst_stride;
2958         w = width;
2959
2960         while (w && (unsigned long)dst & 15)
2961         {
2962             d = *dst;
2963             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2964                                               _mm_movepi64_pi64 (xmm_alpha),
2965                                               unpack_32_1x64 (d)));
2966             w--;
2967         }
2968
2969         cache_prefetch ((__m128i*)dst);
2970
2971         while (w >= 4)
2972         {
2973             /* fill cache line with next memory */
2974             cache_prefetch_next ((__m128i*)dst);
2975
2976             xmm_dst = load_128_aligned ((__m128i*)dst);
2977
2978             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2979
2980             over_2x128 (&xmm_src, &xmm_src,
2981                         &xmm_alpha, &xmm_alpha,
2982                         &xmm_dst_lo, &xmm_dst_hi);
2983
2984             /* rebuild the 4 pixel data and save */
2985             save_128_aligned (
2986                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2987
2988             w -= 4;
2989             dst += 4;
2990         }
2991
2992         while (w)
2993         {
2994             d = *dst;
2995             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2996                                               _mm_movepi64_pi64 (xmm_alpha),
2997                                               unpack_32_1x64 (d)));
2998             w--;
2999         }
3000
3001     }
3002     _mm_empty ();
3003 }
3004
3005 /* ---------------------------------------------------------------------
3006  * composite_over_n_0565
3007  */
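/* Solid source OVER an r5g6b5 destination.  Since each pixel is only
 * 16 bits wide, the aligned loop unpacks 8 pixels at a time to 8888,
 * blends, and packs the result back to 565.
 */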
3008 static void
3009 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3010                             pixman_op_t              op,
3011                             pixman_image_t *         src_image,
3012                             pixman_image_t *         mask_image,
3013                             pixman_image_t *         dst_image,
3014                             int32_t                  src_x,
3015                             int32_t                  src_y,
3016                             int32_t                  mask_x,
3017                             int32_t                  mask_y,
3018                             int32_t                  dest_x,
3019                             int32_t                  dest_y,
3020                             int32_t                  width,
3021                             int32_t                  height)
3022 {
3023     uint32_t src;
3024     uint16_t    *dst_line, *dst, d;
3025     uint16_t w;
3026     int dst_stride;
3027     __m128i xmm_src, xmm_alpha;
3028     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3029
3030     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3031
3032     if (src == 0)
3033         return;
3034
3035     PIXMAN_IMAGE_GET_LINE (
3036         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3037
3038     xmm_src = expand_pixel_32_1x128 (src);
3039     xmm_alpha = expand_alpha_1x128 (xmm_src);
3040
3041     while (height--)
3042     {
3043         dst = dst_line;
3044
3045         /* call prefetch hint to optimize cache load*/
3046         cache_prefetch ((__m128i*)dst);
3047
3048         dst_line += dst_stride;
3049         w = width;
3050
3051         while (w && (unsigned long)dst & 15)
3052         {
3053             d = *dst;
3054
3055             *dst++ = pack_565_32_16 (
3056                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3057                                          _mm_movepi64_pi64 (xmm_alpha),
3058                                          expand565_16_1x64 (d))));
3059             w--;
3060         }
3061
3062         /* call prefetch hint to optimize cache load*/
3063         cache_prefetch ((__m128i*)dst);
3064
3065         while (w >= 8)
3066         {
3067             /* fill cache line with next memory */
3068             cache_prefetch_next ((__m128i*)dst);
3069
3070             xmm_dst = load_128_aligned ((__m128i*)dst);
3071
3072             unpack_565_128_4x128 (xmm_dst,
3073                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3074
3075             over_2x128 (&xmm_src, &xmm_src,
3076                         &xmm_alpha, &xmm_alpha,
3077                         &xmm_dst0, &xmm_dst1);
3078             over_2x128 (&xmm_src, &xmm_src,
3079                         &xmm_alpha, &xmm_alpha,
3080                         &xmm_dst2, &xmm_dst3);
3081
3082             xmm_dst = pack_565_4x128_128 (
3083                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3084
3085             save_128_aligned ((__m128i*)dst, xmm_dst);
3086
3087             dst += 8;
3088             w -= 8;
3089         }
3090
3091         while (w--)
3092         {
3093             d = *dst;
3094             *dst++ = pack_565_32_16 (
3095                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3096                                          _mm_movepi64_pi64 (xmm_alpha),
3097                                          expand565_16_1x64 (d))));
3098         }
3099     }
3100
3101     _mm_empty ();
3102 }
3103
3104 /* ------------------------------
3105  * composite_add_n_8888_8888_ca
3106  */
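/* Solid source with a component-alpha 8888 mask, ADDed into an 8888
 * destination.  Blocks of four mask pixels that are entirely zero are
 * detected with _mm_movemask_epi8 on a compare-with-zero and skipped,
 * since they leave the destination unchanged.
 */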
3107 static void
3108 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3109                                    pixman_op_t              op,
3110                                    pixman_image_t *         src_image,
3111                                    pixman_image_t *         mask_image,
3112                                    pixman_image_t *         dst_image,
3113                                    int32_t                  src_x,
3114                                    int32_t                  src_y,
3115                                    int32_t                  mask_x,
3116                                    int32_t                  mask_y,
3117                                    int32_t                  dest_x,
3118                                    int32_t                  dest_y,
3119                                    int32_t                  width,
3120                                    int32_t                  height)
3121 {
3122     uint32_t src, srca;
3123     uint32_t    *dst_line, d;
3124     uint32_t    *mask_line, m;
3125     uint32_t pack_cmp;
3126     int dst_stride, mask_stride;
3127
3128     __m128i xmm_src, xmm_alpha;
3129     __m128i xmm_dst;
3130     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3131
3132     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3133
3134     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3135     srca = src >> 24;
3136     
3137     if (src == 0)
3138         return;
3139
3140     PIXMAN_IMAGE_GET_LINE (
3141         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3142     PIXMAN_IMAGE_GET_LINE (
3143         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3144
3145     xmm_src = _mm_unpacklo_epi8 (
3146         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3147     xmm_alpha = expand_alpha_1x128 (xmm_src);
3148     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3149     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3150
3151     while (height--)
3152     {
3153         int w = width;
3154         const uint32_t *pm = (uint32_t *)mask_line;
3155         uint32_t *pd = (uint32_t *)dst_line;
3156
3157         dst_line += dst_stride;
3158         mask_line += mask_stride;
3159
3160         /* call prefetch hint to optimize cache load*/
3161         cache_prefetch ((__m128i*)pd);
3162         cache_prefetch ((__m128i*)pm);
3163
3164         while (w && (unsigned long)pd & 15)
3165         {
3166             m = *pm++;
3167
3168             if (m)
3169             {
3170                 d = *pd;
3171                 
3172                 mmx_mask = unpack_32_1x64 (m);
3173                 mmx_dest = unpack_32_1x64 (d);
3174
3175                 *pd = pack_1x64_32 (
3176                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3177             }
3178
3179             pd++;
3180             w--;
3181         }
3182
3183         /* call prefetch hint to optimize cache load*/
3184         cache_prefetch ((__m128i*)pd);
3185         cache_prefetch ((__m128i*)pm);
3186
3187         while (w >= 4)
3188         {
3189             /* fill cache line with next memory */
3190             cache_prefetch_next ((__m128i*)pd);
3191             cache_prefetch_next ((__m128i*)pm);
3192
3193             xmm_mask = load_128_unaligned ((__m128i*)pm);
3194
3195             pack_cmp =
3196                 _mm_movemask_epi8 (
3197                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3198
3199             /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
3200             if (pack_cmp != 0xffff)
3201             {
3202                 xmm_dst = load_128_aligned ((__m128i*)pd);
3203
3204                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3205
3206                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3207                                     &xmm_mask_lo, &xmm_mask_hi,
3208                                     &xmm_mask_lo, &xmm_mask_hi);
3209                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3210                 
3211                 save_128_aligned (
3212                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3213             }
3214
3215             pd += 4;
3216             pm += 4;
3217             w -= 4;
3218         }
3219
3220         while (w)
3221         {
3222             m = *pm++;
3223
3224             if (m)
3225             {
3226                 d = *pd;
3227                 
3228                 mmx_mask = unpack_32_1x64 (m);
3229                 mmx_dest = unpack_32_1x64 (d);
3230
3231                 *pd = pack_1x64_32 (
3232                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3233             }
3234
3235             pd++;
3236             w--;
3237         }
3238     }
3239
3240     _mm_empty ();
3241 }
3242
3243 /* ---------------------------------------------------------------------------
3244  * composite_over_n_8888_8888_ca
3245  */
3246
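/* Solid source with a component-alpha 8888 mask, OVER an 8888
 * destination (the in_over operation).  As in the ADD variant above,
 * all-zero mask blocks are skipped.
 */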
3247 static void
3248 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3249                                     pixman_op_t              op,
3250                                     pixman_image_t *         src_image,
3251                                     pixman_image_t *         mask_image,
3252                                     pixman_image_t *         dst_image,
3253                                     int32_t                  src_x,
3254                                     int32_t                  src_y,
3255                                     int32_t                  mask_x,
3256                                     int32_t                  mask_y,
3257                                     int32_t                  dest_x,
3258                                     int32_t                  dest_y,
3259                                     int32_t                  width,
3260                                     int32_t                  height)
3261 {
3262     uint32_t src;
3263     uint32_t    *dst_line, d;
3264     uint32_t    *mask_line, m;
3265     uint32_t pack_cmp;
3266     int dst_stride, mask_stride;
3267
3268     __m128i xmm_src, xmm_alpha;
3269     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3270     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3271
3272     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3273
3274     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3275
3276     if (src == 0)
3277         return;
3278
3279     PIXMAN_IMAGE_GET_LINE (
3280         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3281     PIXMAN_IMAGE_GET_LINE (
3282         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3283
3284     xmm_src = _mm_unpacklo_epi8 (
3285         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3286     xmm_alpha = expand_alpha_1x128 (xmm_src);
3287     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3288     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3289
3290     while (height--)
3291     {
3292         int w = width;
3293         const uint32_t *pm = (uint32_t *)mask_line;
3294         uint32_t *pd = (uint32_t *)dst_line;
3295
3296         dst_line += dst_stride;
3297         mask_line += mask_stride;
3298
3299         /* call prefetch hint to optimize cache load*/
3300         cache_prefetch ((__m128i*)pd);
3301         cache_prefetch ((__m128i*)pm);
3302
3303         while (w && (unsigned long)pd & 15)
3304         {
3305             m = *pm++;
3306
3307             if (m)
3308             {
3309                 d = *pd;
3310                 mmx_mask = unpack_32_1x64 (m);
3311                 mmx_dest = unpack_32_1x64 (d);
3312
3313                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3314                                                   &mmx_alpha,
3315                                                   &mmx_mask,
3316                                                   &mmx_dest));
3317             }
3318
3319             pd++;
3320             w--;
3321         }
3322
3323         /* call prefetch hint to optimize cache load*/
3324         cache_prefetch ((__m128i*)pd);
3325         cache_prefetch ((__m128i*)pm);
3326
3327         while (w >= 4)
3328         {
3329             /* fill cache line with next memory */
3330             cache_prefetch_next ((__m128i*)pd);
3331             cache_prefetch_next ((__m128i*)pm);
3332
3333             xmm_mask = load_128_unaligned ((__m128i*)pm);
3334
3335             pack_cmp =
3336                 _mm_movemask_epi8 (
3337                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3338
3339             /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
3340             if (pack_cmp != 0xffff)
3341             {
3342                 xmm_dst = load_128_aligned ((__m128i*)pd);
3343
3344                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3345                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3346
3347                 in_over_2x128 (&xmm_src, &xmm_src,
3348                                &xmm_alpha, &xmm_alpha,
3349                                &xmm_mask_lo, &xmm_mask_hi,
3350                                &xmm_dst_lo, &xmm_dst_hi);
3351
3352                 save_128_aligned (
3353                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3354             }
3355
3356             pd += 4;
3357             pm += 4;
3358             w -= 4;
3359         }
3360
3361         while (w)
3362         {
3363             m = *pm++;
3364
3365             if (m)
3366             {
3367                 d = *pd;
3368                 mmx_mask = unpack_32_1x64 (m);
3369                 mmx_dest = unpack_32_1x64 (d);
3370
3371                 *pd = pack_1x64_32 (
3372                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3373             }
3374
3375             pd++;
3376             w--;
3377         }
3378     }
3379
3380     _mm_empty ();
3381 }
3382
3383 /*---------------------------------------------------------------------
3384  * composite_over_8888_n_8888
3385  */
3386
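/* 8888 source OVER an 8888 destination through a solid mask; only the
 * alpha of the mask (mask >> 24) is used, replicated into xmm_mask.
 */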
3387 static void
3388 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3389                                  pixman_op_t              op,
3390                                  pixman_image_t *         src_image,
3391                                  pixman_image_t *         mask_image,
3392                                  pixman_image_t *         dst_image,
3393                                  int32_t                  src_x,
3394                                  int32_t                  src_y,
3395                                  int32_t                  mask_x,
3396                                  int32_t                  mask_y,
3397                                  int32_t                  dest_x,
3398                                  int32_t                  dest_y,
3399                                  int32_t                  width,
3400                                  int32_t                  height)
3401 {
3402     uint32_t    *dst_line, *dst;
3403     uint32_t    *src_line, *src;
3404     uint32_t mask;
3405     uint16_t w;
3406     int dst_stride, src_stride;
3407
3408     __m128i xmm_mask;
3409     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3410     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3411     __m128i xmm_alpha_lo, xmm_alpha_hi;
3412
3413     PIXMAN_IMAGE_GET_LINE (
3414         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3415     PIXMAN_IMAGE_GET_LINE (
3416         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3417
3418     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3419
3420     xmm_mask = create_mask_16_128 (mask >> 24);
3421
3422     while (height--)
3423     {
3424         dst = dst_line;
3425         dst_line += dst_stride;
3426         src = src_line;
3427         src_line += src_stride;
3428         w = width;
3429
3430         /* call prefetch hint to optimize cache load */
3431         cache_prefetch ((__m128i*)dst);
3432         cache_prefetch ((__m128i*)src);
3433
3434         while (w && (unsigned long)dst & 15)
3435         {
3436             uint32_t s = *src++;
3437             uint32_t d = *dst;
3438
3439             __m64 ms = unpack_32_1x64 (s);
3440             __m64 alpha    = expand_alpha_1x64 (ms);
3441             __m64 mask     = _mm_movepi64_pi64 (xmm_mask);
3442             __m64 dest     = unpack_32_1x64 (d);
3443
3444             *dst++ = pack_1x64_32 (
3445                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3446
3447             w--;
3448         }
3449
3450         /* call prefetch hint to optimize cache load */
3451         cache_prefetch ((__m128i*)dst);
3452         cache_prefetch ((__m128i*)src);
3453
3454         while (w >= 4)
3455         {
3456             /* fill cache line with next memory */
3457             cache_prefetch_next ((__m128i*)dst);
3458             cache_prefetch_next ((__m128i*)src);
3459
3460             xmm_src = load_128_unaligned ((__m128i*)src);
3461             xmm_dst = load_128_aligned ((__m128i*)dst);
3462
3463             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3464             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3465             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3466                                 &xmm_alpha_lo, &xmm_alpha_hi);
3467
3468             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3469                            &xmm_alpha_lo, &xmm_alpha_hi,
3470                            &xmm_mask, &xmm_mask,
3471                            &xmm_dst_lo, &xmm_dst_hi);
3472
3473             save_128_aligned (
3474                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3475
3476             dst += 4;
3477             src += 4;
3478             w -= 4;
3479         }
3480
3481         while (w)
3482         {
3483             uint32_t s = *src++;
3484             uint32_t d = *dst;
3485
3486             __m64 ms = unpack_32_1x64 (s);
3487             __m64 alpha = expand_alpha_1x64 (ms);
3488             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3489             __m64 dest  = unpack_32_1x64 (d);
3490
3491             *dst++ = pack_1x64_32 (
3492                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3493
3494             w--;
3495         }
3496     }
3497
3498     _mm_empty ();
3499 }
3500
3501 /* ---------------------------------------------------------------------
3502  * composite_over_x888_n_8888
3503  */
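/* Like the 8888 case above, but with an x8r8g8b8 source: the undefined
 * alpha byte is forced to 0xff ("| 0xff000000" in the scalar loops,
 * mask_ff000000 in the vector loop), so the per-pixel source alpha is
 * the constant mask_00ff and only the solid mask alpha modulates the
 * blend.
 */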
3504 static void
3505 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3506                                  pixman_op_t              op,
3507                                  pixman_image_t *         src_image,
3508                                  pixman_image_t *         mask_image,
3509                                  pixman_image_t *         dst_image,
3510                                  int32_t                  src_x,
3511                                  int32_t                  src_y,
3512                                  int32_t                  mask_x,
3513                                  int32_t                  mask_y,
3514                                  int32_t                  dest_x,
3515                                  int32_t                  dest_y,
3516                                  int32_t                  width,
3517                                  int32_t                  height)
3518 {
3519     uint32_t    *dst_line, *dst;
3520     uint32_t    *src_line, *src;
3521     uint32_t mask;
3522     int dst_stride, src_stride;
3523     uint16_t w;
3524
3525     __m128i xmm_mask, xmm_alpha;
3526     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3527     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3528
3529     PIXMAN_IMAGE_GET_LINE (
3530         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3531     PIXMAN_IMAGE_GET_LINE (
3532         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3533
3534     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3535
3536     xmm_mask = create_mask_16_128 (mask >> 24);
3537     xmm_alpha = mask_00ff;
3538
3539     while (height--)
3540     {
3541         dst = dst_line;
3542         dst_line += dst_stride;
3543         src = src_line;
3544         src_line += src_stride;
3545         w = width;
3546
3547         /* call prefetch hint to optimize cache load */
3548         cache_prefetch ((__m128i*)dst);
3549         cache_prefetch ((__m128i*)src);
3550
3551         while (w && (unsigned long)dst & 15)
3552         {
3553             uint32_t s = (*src++) | 0xff000000;
3554             uint32_t d = *dst;
3555
3556             __m64 src   = unpack_32_1x64 (s);
3557             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3558             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3559             __m64 dest  = unpack_32_1x64 (d);
3560
3561             *dst++ = pack_1x64_32 (
3562                 in_over_1x64 (&src, &alpha, &mask, &dest));
3563
3564             w--;
3565         }
3566
3567         /* call prefetch hint to optimize cache load */
3568         cache_prefetch ((__m128i*)dst);
3569         cache_prefetch ((__m128i*)src);
3570
3571         while (w >= 4)
3572         {
3573             /* fill cache line with next memory */
3574             cache_prefetch_next ((__m128i*)dst);
3575             cache_prefetch_next ((__m128i*)src);
3576
3577             xmm_src = _mm_or_si128 (
3578                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3579             xmm_dst = load_128_aligned ((__m128i*)dst);
3580
3581             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3582             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3583
3584             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3585                            &xmm_alpha, &xmm_alpha,
3586                            &xmm_mask, &xmm_mask,
3587                            &xmm_dst_lo, &xmm_dst_hi);
3588
3589             save_128_aligned (
3590                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3591
3592             dst += 4;
3593             src += 4;
3594             w -= 4;
3595
3596         }
3597
3598         while (w)
3599         {
3600             uint32_t s = (*src++) | 0xff000000;
3601             uint32_t d = *dst;
3602
3603             __m64 src  = unpack_32_1x64 (s);
3604             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3605             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3606             __m64 dest  = unpack_32_1x64 (d);
3607
3608             *dst++ = pack_1x64_32 (
3609                 in_over_1x64 (&src, &alpha, &mask, &dest));
3610
3611             w--;
3612         }
3613     }
3614
3615     _mm_empty ();
3616 }
3617
3618 /* --------------------------------------------------------------------
3619  * composite_over_8888_8888
3620  */
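/* Plain per-pixel OVER of an a8r8g8b8 source onto an a8r8g8b8
 * destination.  Each scanline is handed to core_combine_over_u_sse2,
 * i.e. the usual premultiplied OVER, d = s + MUL (d, 255 - ALPHA (s))
 * per component (MUL/ALPHA used loosely here).
 */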
3621 static void
3622 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3623                                pixman_op_t              op,
3624                                pixman_image_t *         src_image,
3625                                pixman_image_t *         mask_image,
3626                                pixman_image_t *         dst_image,
3627                                int32_t                  src_x,
3628                                int32_t                  src_y,
3629                                int32_t                  mask_x,
3630                                int32_t                  mask_y,
3631                                int32_t                  dest_x,
3632                                int32_t                  dest_y,
3633                                int32_t                  width,
3634                                int32_t                  height)
3635 {
3636     int dst_stride, src_stride;
3637     uint32_t    *dst_line, *dst;
3638     uint32_t    *src_line, *src;
3639
3640     PIXMAN_IMAGE_GET_LINE (
3641         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3642     PIXMAN_IMAGE_GET_LINE (
3643         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3644
3645     dst = dst_line;
3646     src = src_line;
3647
3648     while (height--)
3649     {
3650         core_combine_over_u_sse2 (dst, src, NULL, width);
3651
3652         dst += dst_stride;
3653         src += src_stride;
3654     }
3655     _mm_empty ();
3656 }
3657
3658 /* ------------------------------------------------------------------
3659  * composite_over_8888_0565
3660  */
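/* The helper below expands an r5g6b5 destination pixel to 8888,
 * applies the usual premultiplied OVER, and packs the result back to
 * 565.  A rough scalar sketch (names are illustrative):
 *
 *     d8888 = expand_565 (dst);
 *     out   = src + MUL (d8888, 255 - ALPHA (src));
 *     return pack_565 (out);
 */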
3661 static force_inline uint16_t
3662 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3663 {
3664     __m64 ms;
3665
3666     ms = unpack_32_1x64 (src);
3667     return pack_565_32_16 (
3668         pack_1x64_32 (
3669             over_1x64 (
3670                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3671 }
3672
3673 static void
3674 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3675                                pixman_op_t              op,
3676                                pixman_image_t *         src_image,
3677                                pixman_image_t *         mask_image,
3678                                pixman_image_t *         dst_image,
3679                                int32_t                  src_x,
3680                                int32_t                  src_y,
3681                                int32_t                  mask_x,
3682                                int32_t                  mask_y,
3683                                int32_t                  dest_x,
3684                                int32_t                  dest_y,
3685                                int32_t                  width,
3686                                int32_t                  height)
3687 {
3688     uint16_t    *dst_line, *dst, d;
3689     uint32_t    *src_line, *src, s;
3690     int dst_stride, src_stride;
3691     uint16_t w;
3692
3693     __m128i xmm_alpha_lo, xmm_alpha_hi;
3694     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3695     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3696
3697     PIXMAN_IMAGE_GET_LINE (
3698         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3699     PIXMAN_IMAGE_GET_LINE (
3700         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3701
3702 #if 0
3703     /* FIXME
3704      *
3705      * I copied this code from the MMX version and kept the FIXME.
3706      * If it's a problem there, it's probably a problem here.
3707      */
3708     assert (src_image->drawable == mask_image->drawable);
3709 #endif
3710
3711     while (height--)
3712     {
3713         dst = dst_line;
3714         src = src_line;
3715
3716         /* call prefetch hint to optimize cache load */
3717         cache_prefetch ((__m128i*)src);
3718         cache_prefetch ((__m128i*)dst);
3719
3720         dst_line += dst_stride;
3721         src_line += src_stride;
3722         w = width;
3723
3724         /* Align dst on a 16-byte boundary */
3725         while (w &&
3726                ((unsigned long)dst & 15))
3727         {
3728             s = *src++;
3729             d = *dst;
3730
3731             *dst++ = composite_over_8888_0565pixel (s, d);
3732             w--;
3733         }
3734
3735         /* call prefetch hint to optimize cache load */
3736         cache_prefetch ((__m128i*)src);
3737         cache_prefetch ((__m128i*)dst);
3738
3739         /* It's an 8-pixel loop */
3740         while (w >= 8)
3741         {
3742             /* fill cache line with next memory */
3743             cache_prefetch_next ((__m128i*)src);
3744             cache_prefetch_next ((__m128i*)dst);
3745
3746             /* I'm loading unaligned because I'm not sure
3747              * about the address alignment.
3748              */
3749             xmm_src = load_128_unaligned ((__m128i*) src);
3750             xmm_dst = load_128_aligned ((__m128i*) dst);
3751
3752             /* Unpacking */
3753             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3754             unpack_565_128_4x128 (xmm_dst,
3755                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3756             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3757                                 &xmm_alpha_lo, &xmm_alpha_hi);
3758
3759             /* I'm loading the next 4 pixels from memory
3760              * ahead of time to optimize the memory read.
3761              */
3762             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3763
3764             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3765                         &xmm_alpha_lo, &xmm_alpha_hi,
3766                         &xmm_dst0, &xmm_dst1);
3767
3768             /* Unpacking */
3769             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3770             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3771                                 &xmm_alpha_lo, &xmm_alpha_hi);
3772
3773             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3774                         &xmm_alpha_lo, &xmm_alpha_hi,
3775                         &xmm_dst2, &xmm_dst3);
3776
3777             save_128_aligned (
3778                 (__m128i*)dst, pack_565_4x128_128 (
3779                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3780
3781             w -= 8;
3782             dst += 8;
3783             src += 8;
3784         }
3785
3786         while (w--)
3787         {
3788             s = *src++;
3789             d = *dst;
3790
3791             *dst++ = composite_over_8888_0565pixel (s, d);
3792         }
3793     }
3794
3795     _mm_empty ();
3796 }
3797
3798 /* -----------------------------------------------------------------
3799  * composite_over_n_8_8888
3800  */
3801
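/* OVER of a solid source through an a8 mask onto an a8r8g8b8
 * destination.  The vector loop reads the mask four bytes at a time:
 * if all four are zero the pixels are skipped, if the source is opaque
 * and all four are 0xff the solid color (xmm_def) is stored directly,
 * otherwise the mask is expanded and in_over applied as usual.
 */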
3802 static void
3803 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3804                               pixman_op_t              op,
3805                               pixman_image_t *         src_image,
3806                               pixman_image_t *         mask_image,
3807                               pixman_image_t *         dst_image,
3808                               int32_t                  src_x,
3809                               int32_t                  src_y,
3810                               int32_t                  mask_x,
3811                               int32_t                  mask_y,
3812                               int32_t                  dest_x,
3813                               int32_t                  dest_y,
3814                               int32_t                  width,
3815                               int32_t                  height)
3816 {
3817     uint32_t src, srca;
3818     uint32_t *dst_line, *dst;
3819     uint8_t *mask_line, *mask;
3820     int dst_stride, mask_stride;
3821     uint16_t w;
3822     uint32_t m, d;
3823
3824     __m128i xmm_src, xmm_alpha, xmm_def;
3825     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3826     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3827
3828     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3829
3830     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3831
3832     srca = src >> 24;
3833     if (src == 0)
3834         return;
3835
3836     PIXMAN_IMAGE_GET_LINE (
3837         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3838     PIXMAN_IMAGE_GET_LINE (
3839         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3840
3841     xmm_def = create_mask_2x32_128 (src, src);
3842     xmm_src = expand_pixel_32_1x128 (src);
3843     xmm_alpha = expand_alpha_1x128 (xmm_src);
3844     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3845     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3846
3847     while (height--)
3848     {
3849         dst = dst_line;
3850         dst_line += dst_stride;
3851         mask = mask_line;
3852         mask_line += mask_stride;
3853         w = width;
3854
3855         /* call prefetch hint to optimize cache load */
3856         cache_prefetch ((__m128i*)mask);
3857         cache_prefetch ((__m128i*)dst);
3858
3859         while (w && (unsigned long)dst & 15)
3860         {
3861             uint8_t m = *mask++;
3862
3863             if (m)
3864             {
3865                 d = *dst;
3866                 mmx_mask = expand_pixel_8_1x64 (m);
3867                 mmx_dest = unpack_32_1x64 (d);
3868
3869                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3870                                                    &mmx_alpha,
3871                                                    &mmx_mask,
3872                                                    &mmx_dest));
3873             }
3874
3875             w--;
3876             dst++;
3877         }
3878
3879         /* call prefetch hint to optimize cache load */
3880         cache_prefetch ((__m128i*)mask);
3881         cache_prefetch ((__m128i*)dst);
3882
3883         while (w >= 4)
3884         {
3885             /* fill cache line with next memory */
3886             cache_prefetch_next ((__m128i*)mask);
3887             cache_prefetch_next ((__m128i*)dst);
3888
3889             m = *((uint32_t*)mask);
3890
3891             if (srca == 0xff && m == 0xffffffff)
3892             {
3893                 save_128_aligned ((__m128i*)dst, xmm_def);
3894             }
3895             else if (m)
3896             {
3897                 xmm_dst = load_128_aligned ((__m128i*) dst);
3898                 xmm_mask = unpack_32_1x128 (m);
3899                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3900
3901                 /* Unpacking */
3902                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3903                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3904
3905                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3906                                         &xmm_mask_lo, &xmm_mask_hi);
3907
3908                 in_over_2x128 (&xmm_src, &xmm_src,
3909                                &xmm_alpha, &xmm_alpha,
3910                                &xmm_mask_lo, &xmm_mask_hi,
3911                                &xmm_dst_lo, &xmm_dst_hi);
3912
3913                 save_128_aligned (
3914                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3915             }
3916
3917             w -= 4;
3918             dst += 4;
3919             mask += 4;
3920         }
3921
3922         while (w)
3923         {
3924             uint8_t m = *mask++;
3925
3926             if (m)
3927             {
3928                 d = *dst;
3929                 mmx_mask = expand_pixel_8_1x64 (m);
3930                 mmx_dest = unpack_32_1x64 (d);
3931
3932                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3933                                                    &mmx_alpha,
3934                                                    &mmx_mask,
3935                                                    &mmx_dest));
3936             }
3937
3938             w--;
3939             dst++;
3940         }
3941     }
3942
3943     _mm_empty ();
3944 }
3945
3946 /* ----------------------------------------------------------------
3947  * pixman_fill_sse2
3948  */
3949
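/* Solid fill, used both on its own and by the fast paths below.  Only
 * 16bpp and 32bpp are handled, and 16bpp only when the two 16-bit
 * halves of 'data' already match, so that the value can be replicated
 * into a full XMM register.  The destination is aligned with 16- and
 * 32-bit stores, the bulk is written with aligned 128-bit stores in
 * blocks of up to 128 bytes, and the tail falls back to 32- and 16-bit
 * stores.
 */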
3950 pixman_bool_t
3951 pixman_fill_sse2 (uint32_t *bits,
3952                   int       stride,
3953                   int       bpp,
3954                   int       x,
3955                   int       y,
3956                   int       width,
3957                   int       height,
3958                   uint32_t  data)
3959 {
3960     uint32_t byte_width;
3961     uint8_t         *byte_line;
3962
3963     __m128i xmm_def;
3964
3965     if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3966         return FALSE;
3967
3968     if (bpp != 16 && bpp != 32)
3969         return FALSE;
3970
3971     if (bpp == 16)
3972     {
3973         stride = stride * (int) sizeof (uint32_t) / 2;
3974         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3975         byte_width = 2 * width;
3976         stride *= 2;
3977     }
3978     else
3979     {
3980         stride = stride * (int) sizeof (uint32_t) / 4;
3981         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3982         byte_width = 4 * width;
3983         stride *= 4;
3984     }
3985
3986     cache_prefetch ((__m128i*)byte_line);
3987     xmm_def = create_mask_2x32_128 (data, data);
3988
3989     while (height--)
3990     {
3991         int w;
3992         uint8_t *d = byte_line;
3993         byte_line += stride;
3994         w = byte_width;
3995
3996
3997         cache_prefetch_next ((__m128i*)d);
3998
3999         while (w >= 2 && ((unsigned long)d & 3))
4000         {
4001             *(uint16_t *)d = data;
4002             w -= 2;
4003             d += 2;
4004         }
4005
4006         while (w >= 4 && ((unsigned long)d & 15))
4007         {
4008             *(uint32_t *)d = data;
4009
4010             w -= 4;
4011             d += 4;
4012         }
4013
4014         cache_prefetch_next ((__m128i*)d);
4015
4016         while (w >= 128)
4017         {
4018             cache_prefetch (((__m128i*)d) + 12);
4019
4020             save_128_aligned ((__m128i*)(d),     xmm_def);
4021             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4022             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4023             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4024             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
4025             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
4026             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
4027             save_128_aligned ((__m128i*)(d + 112), xmm_def);
4028
4029             d += 128;
4030             w -= 128;
4031         }
4032
4033         if (w >= 64)
4034         {
4035             cache_prefetch (((__m128i*)d) + 8);
4036
4037             save_128_aligned ((__m128i*)(d),     xmm_def);
4038             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4039             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4040             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4041
4042             d += 64;
4043             w -= 64;
4044         }
4045
4046         cache_prefetch_next ((__m128i*)d);
4047
4048         if (w >= 32)
4049         {
4050             save_128_aligned ((__m128i*)(d),     xmm_def);
4051             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4052
4053             d += 32;
4054             w -= 32;
4055         }
4056
4057         if (w >= 16)
4058         {
4059             save_128_aligned ((__m128i*)(d),     xmm_def);
4060
4061             d += 16;
4062             w -= 16;
4063         }
4064
4065         cache_prefetch_next ((__m128i*)d);
4066
4067         while (w >= 4)
4068         {
4069             *(uint32_t *)d = data;
4070
4071             w -= 4;
4072             d += 4;
4073         }
4074
4075         if (w >= 2)
4076         {
4077             *(uint16_t *)d = data;
4078             w -= 2;
4079             d += 2;
4080         }
4081     }
4082
4083     _mm_empty ();
4084     return TRUE;
4085 }
4086
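/* -----------------------------------------------------------------------
 * composite_src_n_8_8888
 */

/* SRC of a solid source through an a8 mask onto an a8r8g8b8
 * destination: the destination is simply replaced by the source scaled
 * by the mask.  A zero mask writes zero, an opaque source under a
 * fully set mask writes the solid color directly, and a fully
 * transparent source degenerates to pixman_fill_sse2 clearing the
 * rectangle.
 */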
4087 static void
4088 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4089                              pixman_op_t              op,
4090                              pixman_image_t *         src_image,
4091                              pixman_image_t *         mask_image,
4092                              pixman_image_t *         dst_image,
4093                              int32_t                  src_x,
4094                              int32_t                  src_y,
4095                              int32_t                  mask_x,
4096                              int32_t                  mask_y,
4097                              int32_t                  dest_x,
4098                              int32_t                  dest_y,
4099                              int32_t                  width,
4100                              int32_t                  height)
4101 {
4102     uint32_t src, srca;
4103     uint32_t    *dst_line, *dst;
4104     uint8_t     *mask_line, *mask;
4105     int dst_stride, mask_stride;
4106     uint16_t w;
4107     uint32_t m;
4108
4109     __m128i xmm_src, xmm_def;
4110     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4111
4112     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4113
4114     srca = src >> 24;
4115     if (src == 0)
4116     {
4117         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4118                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
4119                           dest_x, dest_y, width, height, 0);
4120         return;
4121     }
4122
4123     PIXMAN_IMAGE_GET_LINE (
4124         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4125     PIXMAN_IMAGE_GET_LINE (
4126         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4127
4128     xmm_def = create_mask_2x32_128 (src, src);
4129     xmm_src = expand_pixel_32_1x128 (src);
4130
4131     while (height--)
4132     {
4133         dst = dst_line;
4134         dst_line += dst_stride;
4135         mask = mask_line;
4136         mask_line += mask_stride;
4137         w = width;
4138
4139         /* call prefetch hint to optimize cache load */
4140         cache_prefetch ((__m128i*)mask);
4141         cache_prefetch ((__m128i*)dst);
4142
4143         while (w && (unsigned long)dst & 15)
4144         {
4145             uint8_t m = *mask++;
4146
4147             if (m)
4148             {
4149                 *dst = pack_1x64_32 (
4150                     pix_multiply_1x64 (
4151                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4152             }
4153             else
4154             {
4155                 *dst = 0;
4156             }
4157
4158             w--;
4159             dst++;
4160         }
4161
4162         /* call prefetch hint to optimize cache load */
4163         cache_prefetch ((__m128i*)mask);
4164         cache_prefetch ((__m128i*)dst);
4165
4166         while (w >= 4)
4167         {
4168             /* fill cache line with next memory */
4169             cache_prefetch_next ((__m128i*)mask);
4170             cache_prefetch_next ((__m128i*)dst);
4171
4172             m = *((uint32_t*)mask);
4173
4174             if (srca == 0xff && m == 0xffffffff)
4175             {
4176                 save_128_aligned ((__m128i*)dst, xmm_def);
4177             }
4178             else if (m)
4179             {
4180                 xmm_mask = unpack_32_1x128 (m);
4181                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4182
4183                 /* Unpacking */
4184                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4185
4186                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4187                                         &xmm_mask_lo, &xmm_mask_hi);
4188
4189                 pix_multiply_2x128 (&xmm_src, &xmm_src,
4190                                     &xmm_mask_lo, &xmm_mask_hi,
4191                                     &xmm_mask_lo, &xmm_mask_hi);
4192
4193                 save_128_aligned (
4194                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4195             }
4196             else
4197             {
4198                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4199             }
4200
4201             w -= 4;
4202             dst += 4;
4203             mask += 4;
4204         }
4205
4206         while (w)
4207         {
4208             uint8_t m = *mask++;
4209
4210             if (m)
4211             {
4212                 *dst = pack_1x64_32 (
4213                     pix_multiply_1x64 (
4214                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4215             }
4216             else
4217             {
4218                 *dst = 0;
4219             }
4220
4221             w--;
4222             dst++;
4223         }
4224     }
4225
4226     _mm_empty ();
4227 }
4228
4229 /*-----------------------------------------------------------------------
4230  * composite_over_n_8_0565
4231  */
4232
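/* OVER of a solid source through an a8 mask onto an r5g6b5
 * destination.  Eight destination pixels are expanded to 8888 per
 * iteration, the mask is consumed four bytes at a time (all-zero
 * groups are skipped), in_over is applied, and the result is packed
 * back to 565.
 */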
4233 static void
4234 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4235                               pixman_op_t              op,
4236                               pixman_image_t *         src_image,
4237                               pixman_image_t *         mask_image,
4238                               pixman_image_t *         dst_image,
4239                               int32_t                  src_x,
4240                               int32_t                  src_y,
4241                               int32_t                  mask_x,
4242                               int32_t                  mask_y,
4243                               int32_t                  dest_x,
4244                               int32_t                  dest_y,
4245                               int32_t                  width,
4246                               int32_t                  height)
4247 {
4248     uint32_t src, srca;
4249     uint16_t    *dst_line, *dst, d;
4250     uint8_t     *mask_line, *mask;
4251     int dst_stride, mask_stride;
4252     uint16_t w;
4253     uint32_t m;
4254     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4255
4256     __m128i xmm_src, xmm_alpha;
4257     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4258     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4259
4260     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4261
4262     srca = src >> 24;
4263     if (src == 0)
4264         return;
4265
4266     PIXMAN_IMAGE_GET_LINE (
4267         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4268     PIXMAN_IMAGE_GET_LINE (
4269         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4270
4271     xmm_src = expand_pixel_32_1x128 (src);
4272     xmm_alpha = expand_alpha_1x128 (xmm_src);
4273     mmx_src = _mm_movepi64_pi64 (xmm_src);
4274     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4275
4276     while (height--)
4277     {
4278         dst = dst_line;
4279         dst_line += dst_stride;
4280         mask = mask_line;
4281         mask_line += mask_stride;
4282         w = width;
4283
4284         /* call prefetch hint to optimize cache load */
4285         cache_prefetch ((__m128i*)mask);
4286         cache_prefetch ((__m128i*)dst);
4287
4288         while (w && (unsigned long)dst & 15)
4289         {
4290             m = *mask++;
4291
4292             if (m)
4293             {
4294                 d = *dst;
4295                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4296                 mmx_dest = expand565_16_1x64 (d);
4297
4298                 *dst = pack_565_32_16 (
4299                     pack_1x64_32 (
4300                         in_over_1x64 (
4301                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4302             }
4303
4304             w--;
4305             dst++;
4306         }
4307
4308         /* call prefetch hint to optimize cache load */
4309         cache_prefetch ((__m128i*)mask);
4310         cache_prefetch ((__m128i*)dst);
4311
4312         while (w >= 8)
4313         {
4314             /* fill cache line with next memory */
4315             cache_prefetch_next ((__m128i*)mask);
4316             cache_prefetch_next ((__m128i*)dst);
4317
4318             xmm_dst = load_128_aligned ((__m128i*) dst);
4319             unpack_565_128_4x128 (xmm_dst,
4320                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4321
4322             m = *((uint32_t*)mask);
4323             mask += 4;
4324
4325             if (m)
4326             {
4327                 xmm_mask = unpack_32_1x128 (m);
4328                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4329
4330                 /* Unpacking */
4331                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4332
4333                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4334                                         &xmm_mask_lo, &xmm_mask_hi);
4335
4336                 in_over_2x128 (&xmm_src, &xmm_src,
4337                                &xmm_alpha, &xmm_alpha,
4338                                &xmm_mask_lo, &xmm_mask_hi,
4339                                &xmm_dst0, &xmm_dst1);
4340             }
4341
4342             m = *((uint32_t*)mask);
4343             mask += 4;
4344
4345             if (m)
4346             {
4347                 xmm_mask = unpack_32_1x128 (m);
4348                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4349
4350                 /* Unpacking */
4351                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4352
4353                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4354                                         &xmm_mask_lo, &xmm_mask_hi);
4355                 in_over_2x128 (&xmm_src, &xmm_src,
4356                                &xmm_alpha, &xmm_alpha,
4357                                &xmm_mask_lo, &xmm_mask_hi,
4358                                &xmm_dst2, &xmm_dst3);
4359             }
4360
4361             save_128_aligned (
4362                 (__m128i*)dst, pack_565_4x128_128 (
4363                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4364
4365             w -= 8;
4366             dst += 8;
4367         }
4368
4369         while (w)
4370         {
4371             m = *mask++;
4372
4373             if (m)
4374             {
4375                 d = *dst;
4376                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4377                 mmx_dest = expand565_16_1x64 (d);
4378
4379                 *dst = pack_565_32_16 (
4380                     pack_1x64_32 (
4381                         in_over_1x64 (
4382                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4383             }
4384
4385             w--;
4386             dst++;
4387         }
4388     }
4389
4390     _mm_empty ();
4391 }
4392
4393 /* -----------------------------------------------------------------------
4394  * composite_over_pixbuf_0565
4395  */
4396
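/* OVER of a non-premultiplied, channel-swapped ("pixbuf") source onto
 * an r5g6b5 destination.  Each group of four source pixels is tested
 * with is_opaque()/is_zero(): fully opaque groups only need the R/B
 * swap (invert_colors_2x128), fully transparent groups are skipped,
 * and everything else goes through over_rev_non_pre, the OVER variant
 * for this reversed, non-premultiplied source format.
 */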
4397 static void
4398 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4399                                  pixman_op_t              op,
4400                                  pixman_image_t *         src_image,
4401                                  pixman_image_t *         mask_image,
4402                                  pixman_image_t *         dst_image,
4403                                  int32_t                  src_x,
4404                                  int32_t                  src_y,
4405                                  int32_t                  mask_x,
4406                                  int32_t                  mask_y,
4407                                  int32_t                  dest_x,
4408                                  int32_t                  dest_y,
4409                                  int32_t                  width,
4410                                  int32_t                  height)
4411 {
4412     uint16_t    *dst_line, *dst, d;
4413     uint32_t    *src_line, *src, s;
4414     int dst_stride, src_stride;
4415     uint16_t w;
4416     uint32_t opaque, zero;
4417
4418     __m64 ms;
4419     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4420     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4421
4422     PIXMAN_IMAGE_GET_LINE (
4423         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4424     PIXMAN_IMAGE_GET_LINE (
4425         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4426
4427 #if 0
4428     /* FIXME
4429      *
4430      * I copied this code from the MMX version and kept the FIXME.
4431      * If it's a problem there, it's probably a problem here.
4432      */
4433     assert (src_image->drawable == mask_image->drawable);
4434 #endif
4435
4436     while (height--)
4437     {
4438         dst = dst_line;
4439         dst_line += dst_stride;
4440         src = src_line;
4441         src_line += src_stride;
4442         w = width;
4443
4444         /* call prefetch hint to optimize cache load */
4445         cache_prefetch ((__m128i*)src);
4446         cache_prefetch ((__m128i*)dst);
4447
4448         while (w && (unsigned long)dst & 15)
4449         {
4450             s = *src++;
4451             d = *dst;
4452
4453             ms = unpack_32_1x64 (s);
4454
4455             *dst++ = pack_565_32_16 (
4456                 pack_1x64_32 (
4457                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4458             w--;
4459         }
4460
4461         /* call prefetch hint to optimize cache load */
4462         cache_prefetch ((__m128i*)src);
4463         cache_prefetch ((__m128i*)dst);
4464
4465         while (w >= 8)
4466         {
4467             /* fill cache line with next memory */
4468             cache_prefetch_next ((__m128i*)src);
4469             cache_prefetch_next ((__m128i*)dst);
4470
4471             /* First round */
4472             xmm_src = load_128_unaligned ((__m128i*)src);
4473             xmm_dst = load_128_aligned  ((__m128i*)dst);
4474
4475             opaque = is_opaque (xmm_src);
4476             zero = is_zero (xmm_src);
4477
4478             unpack_565_128_4x128 (xmm_dst,
4479                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4480             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4481
4482             /* preload next round */
4483             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4484
4485             if (opaque)
4486             {
4487                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4488                                      &xmm_dst0, &xmm_dst1);
4489             }
4490             else if (!zero)
4491             {
4492                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4493                                         &xmm_dst0, &xmm_dst1);
4494             }
4495
4496             /* Second round */
4497             opaque = is_opaque (xmm_src);
4498             zero = is_zero (xmm_src);
4499
4500             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4501
4502             if (opaque)
4503             {
4504                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4505                                      &xmm_dst2, &xmm_dst3);
4506             }
4507             else if (!zero)
4508             {
4509                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4510                                         &xmm_dst2, &xmm_dst3);
4511             }
4512
4513             save_128_aligned (
4514                 (__m128i*)dst, pack_565_4x128_128 (
4515                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4516
4517             w -= 8;
4518             src += 8;
4519             dst += 8;
4520         }
4521
4522         while (w)
4523         {
4524             s = *src++;
4525             d = *dst;
4526
4527             ms = unpack_32_1x64 (s);
4528
4529             *dst++ = pack_565_32_16 (
4530                 pack_1x64_32 (
4531                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4532             w--;
4533         }
4534     }
4535
4536     _mm_empty ();
4537 }
4538
4539 /* -------------------------------------------------------------------------
4540  * composite_over_pixbuf_8888
4541  */
4542
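/* Same source handling as the pixbuf/0565 case above, but with an
 * a8r8g8b8 destination: opaque groups of four only need the channel
 * swap, all-zero groups leave the destination untouched, and the rest
 * are composited with over_rev_non_pre.
 */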
4543 static void
4544 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4545                                  pixman_op_t              op,
4546                                  pixman_image_t *         src_image,
4547                                  pixman_image_t *         mask_image,
4548                                  pixman_image_t *         dst_image,
4549                                  int32_t                  src_x,
4550                                  int32_t                  src_y,
4551                                  int32_t                  mask_x,
4552                                  int32_t                  mask_y,
4553                                  int32_t                  dest_x,
4554                                  int32_t                  dest_y,
4555                                  int32_t                  width,
4556                                  int32_t                  height)
4557 {
4558     uint32_t    *dst_line, *dst, d;
4559     uint32_t    *src_line, *src, s;
4560     int dst_stride, src_stride;
4561     uint16_t w;
4562     uint32_t opaque, zero;
4563
4564     __m128i xmm_src_lo, xmm_src_hi;
4565     __m128i xmm_dst_lo, xmm_dst_hi;
4566
4567     PIXMAN_IMAGE_GET_LINE (
4568         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4569     PIXMAN_IMAGE_GET_LINE (
4570         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4571
4572 #if 0
4573     /* FIXME
4574      *
4575      * I copied this code from the MMX version and kept the FIXME.
4576      * If it's a problem there, it's probably a problem here.
4577      */
4578     assert (src_image->drawable == mask_image->drawable);
4579 #endif
4580
4581     while (height--)
4582     {
4583         dst = dst_line;
4584         dst_line += dst_stride;
4585         src = src_line;
4586         src_line += src_stride;
4587         w = width;
4588
4589         /* call prefetch hint to optimize cache load */
4590         cache_prefetch ((__m128i*)src);
4591         cache_prefetch ((__m128i*)dst);
4592
4593         while (w && (unsigned long)dst & 15)
4594         {
4595             s = *src++;
4596             d = *dst;
4597
4598             *dst++ = pack_1x64_32 (
4599                 over_rev_non_pre_1x64 (
4600                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4601
4602             w--;
4603         }
4604
4605         /* call prefetch hint to optimize cache load */
4606         cache_prefetch ((__m128i*)src);
4607         cache_prefetch ((__m128i*)dst);
4608
4609         while (w >= 4)
4610         {
4611             /* fill cache line with next memory */
4612             cache_prefetch_next ((__m128i*)src);
4613             cache_prefetch_next ((__m128i*)dst);
4614
4615             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4616
4617             opaque = is_opaque (xmm_src_hi);
4618             zero = is_zero (xmm_src_hi);
4619
4620             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4621
4622             if (opaque)
4623             {
4624                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4625                                      &xmm_dst_lo, &xmm_dst_hi);
4626
4627                 save_128_aligned (
4628                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4629             }
4630             else if (!zero)
4631             {
4632                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4633
4634                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4635
4636                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4637                                         &xmm_dst_lo, &xmm_dst_hi);
4638
4639                 save_128_aligned (
4640                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4641             }
4642
4643             w -= 4;
4644             dst += 4;
4645             src += 4;
4646         }
4647
4648         while (w)
4649         {
4650             s = *src++;
4651             d = *dst;
4652
4653             *dst++ = pack_1x64_32 (
4654                 over_rev_non_pre_1x64 (
4655                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4656
4657             w--;
4658         }
4659     }
4660
4661     _mm_empty ();
4662 }
4663
4664 /* -----------------------------------------------------------------------
4665  * composite_over_n_8888_0565_ca
4666  */
4667
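/* Component-alpha OVER of a solid source through an a8r8g8b8 mask onto
 * an r5g6b5 destination: each channel of the source is modulated by
 * the corresponding mask channel.  The _mm_cmpeq_epi32/_mm_movemask_epi8
 * test yields 0xffff when a group of four mask pixels is entirely
 * zero, letting those groups skip the in_over work.
 */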
4668 static void
4669 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4670                                     pixman_op_t              op,
4671                                     pixman_image_t *         src_image,
4672                                     pixman_image_t *         mask_image,
4673                                     pixman_image_t *         dst_image,
4674                                     int32_t                  src_x,
4675                                     int32_t                  src_y,
4676                                     int32_t                  mask_x,
4677                                     int32_t                  mask_y,
4678                                     int32_t                  dest_x,
4679                                     int32_t                  dest_y,
4680                                     int32_t                  width,
4681                                     int32_t                  height)
4682 {
4683     uint32_t src;
4684     uint16_t    *dst_line, *dst, d;
4685     uint32_t    *mask_line, *mask, m;
4686     int dst_stride, mask_stride;
4687     int w;
4688     uint32_t pack_cmp;
4689
4690     __m128i xmm_src, xmm_alpha;
4691     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4692     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4693
4694     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4695
4696     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4697
4698     if (src == 0)
4699         return;
4700
4701     PIXMAN_IMAGE_GET_LINE (
4702         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4703     PIXMAN_IMAGE_GET_LINE (
4704         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4705
4706     xmm_src = expand_pixel_32_1x128 (src);
4707     xmm_alpha = expand_alpha_1x128 (xmm_src);
4708     mmx_src = _mm_movepi64_pi64 (xmm_src);
4709     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4710
4711     while (height--)
4712     {
4713         w = width;
4714         mask = mask_line;
4715         dst = dst_line;
4716         mask_line += mask_stride;
4717         dst_line += dst_stride;
4718
4719         /* call prefetch hint to optimize cache load */
4720         cache_prefetch ((__m128i*)mask);
4721         cache_prefetch ((__m128i*)dst);
4722
4723         while (w && ((unsigned long)dst & 15))
4724         {
4725             m = *(uint32_t *) mask;
4726
4727             if (m)
4728             {
4729                 d = *dst;
4730                 mmx_mask = unpack_32_1x64 (m);
4731                 mmx_dest = expand565_16_1x64 (d);
4732
4733                 *dst = pack_565_32_16 (
4734                     pack_1x64_32 (
4735                         in_over_1x64 (
4736                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4737             }
4738
4739             w--;
4740             dst++;
4741             mask++;
4742         }
4743
4744         /* call prefetch hint to optimize cache load */
4745         cache_prefetch ((__m128i*)mask);
4746         cache_prefetch ((__m128i*)dst);
4747
4748         while (w >= 8)
4749         {
4750             /* fill cache line with next memory */
4751             cache_prefetch_next ((__m128i*)mask);
4752             cache_prefetch_next ((__m128i*)dst);
4753
4754             /* First round */
4755             xmm_mask = load_128_unaligned ((__m128i*)mask);
4756             xmm_dst = load_128_aligned ((__m128i*)dst);
4757
4758             pack_cmp = _mm_movemask_epi8 (
4759                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4760
4761             unpack_565_128_4x128 (xmm_dst,
4762                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4763             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4764
4765             /* preload next round */
4766             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4767
4768             /* only blend this group if at least one of the four mask pixels is non-zero */
4769             if (pack_cmp != 0xffff)
4770             {
4771                 in_over_2x128 (&xmm_src, &xmm_src,
4772                                &xmm_alpha, &xmm_alpha,
4773                                &xmm_mask_lo, &xmm_mask_hi,
4774                                &xmm_dst0, &xmm_dst1);
4775             }
4776
4777             /* Second round */
4778             pack_cmp = _mm_movemask_epi8 (
4779                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4780
4781             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4782
4783             if (pack_cmp != 0xffff)
4784             {
4785                 in_over_2x128 (&xmm_src, &xmm_src,
4786                                &xmm_alpha, &xmm_alpha,
4787                                &xmm_mask_lo, &xmm_mask_hi,
4788                                &xmm_dst2, &xmm_dst3);
4789             }
4790
4791             save_128_aligned (
4792                 (__m128i*)dst, pack_565_4x128_128 (
4793                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4794
4795             w -= 8;
4796             dst += 8;
4797             mask += 8;
4798         }
4799
4800         while (w)
4801         {
4802             m = *(uint32_t *) mask;
4803
4804             if (m)
4805             {
4806                 d = *dst;
4807                 mmx_mask = unpack_32_1x64 (m);
4808                 mmx_dest = expand565_16_1x64 (d);
4809
4810                 *dst = pack_565_32_16 (
4811                     pack_1x64_32 (
4812                         in_over_1x64 (
4813                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4814             }
4815
4816             w--;
4817             dst++;
4818             mask++;
4819         }
4820     }
4821
4822     _mm_empty ();
4823 }
4824
4825 /* -----------------------------------------------------------------------
4826  * composite_in_n_8_8
4827  */
4828
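/* IN of a solid source through an a8 mask onto an a8 destination:
 * every destination byte is scaled by both the mask and the source
 * alpha, roughly dst = MUL (MUL (srca, m), dst) per byte (MUL being an
 * illustrative rounded 8-bit multiply), 16 pixels per SSE2 iteration.
 */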
4829 static void
4830 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4831                          pixman_op_t              op,
4832                          pixman_image_t *         src_image,
4833                          pixman_image_t *         mask_image,
4834                          pixman_image_t *         dst_image,
4835                          int32_t                  src_x,
4836                          int32_t                  src_y,
4837                          int32_t                  mask_x,
4838                          int32_t                  mask_y,
4839                          int32_t                  dest_x,
4840                          int32_t                  dest_y,
4841                          int32_t                  width,
4842                          int32_t                  height)
4843 {
4844     uint8_t     *dst_line, *dst;
4845     uint8_t     *mask_line, *mask;
4846     int dst_stride, mask_stride;
4847     uint16_t w, d, m;
4848     uint32_t src;
4849     uint8_t sa;
4850
4851     __m128i xmm_alpha;
4852     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4853     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4854
4855     PIXMAN_IMAGE_GET_LINE (
4856         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4857     PIXMAN_IMAGE_GET_LINE (
4858         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4859
4860     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4861
4862     sa = src >> 24;
4863
4864     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4865
4866     while (height--)
4867     {
4868         dst = dst_line;
4869         dst_line += dst_stride;
4870         mask = mask_line;
4871         mask_line += mask_stride;
4872         w = width;
4873
4874         /* call prefetch hint to optimize cache load */
4875         cache_prefetch ((__m128i*)mask);
4876         cache_prefetch ((__m128i*)dst);
4877
4878         while (w && ((unsigned long)dst & 15))
4879         {
4880             m = (uint32_t) *mask++;
4881             d = (uint32_t) *dst;
4882
4883             *dst++ = (uint8_t) pack_1x64_32 (
4884                 pix_multiply_1x64 (
4885                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4886                                        unpack_32_1x64 (m)),
4887                     unpack_32_1x64 (d)));
4888             w--;
4889         }
4890
4891         /* call prefetch hint to optimize cache load */
4892         cache_prefetch ((__m128i*)mask);
4893         cache_prefetch ((__m128i*)dst);
4894
4895         while (w >= 16)
4896         {
4897             /* fill cache line with next memory */
4898             cache_prefetch_next ((__m128i*)mask);
4899             cache_prefetch_next ((__m128i*)dst);
4900
4901             xmm_mask = load_128_unaligned ((__m128i*)mask);
4902             xmm_dst = load_128_aligned ((__m128i*)dst);
4903
4904             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4905             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4906
4907             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4908                                 &xmm_mask_lo, &xmm_mask_hi,
4909                                 &xmm_mask_lo, &xmm_mask_hi);
4910
4911             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4912                                 &xmm_dst_lo, &xmm_dst_hi,
4913                                 &xmm_dst_lo, &xmm_dst_hi);
4914
4915             save_128_aligned (
4916                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4917
4918             mask += 16;
4919             dst += 16;
4920             w -= 16;
4921         }
4922
4923         while (w)
4924         {
4925             m = (uint32_t) *mask++;
4926             d = (uint32_t) *dst;
4927
4928             *dst++ = (uint8_t) pack_1x64_32 (
4929                 pix_multiply_1x64 (
4930                     pix_multiply_1x64 (
4931                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4932                     unpack_32_1x64 (d)));
4933             w--;
4934         }
4935     }
4936
4937     _mm_empty ();
4938 }
4939
4940 /* ---------------------------------------------------------------------------
4941  * composite_in_8_8
4942  */
4943
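/* IN of an a8 source onto an a8 destination: each destination byte is
 * multiplied by the corresponding source byte, 16 pixels per SSE2
 * iteration.
 */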
4944 static void
4945 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4946                        pixman_op_t              op,
4947                        pixman_image_t *         src_image,
4948                        pixman_image_t *         mask_image,
4949                        pixman_image_t *         dst_image,
4950                        int32_t                  src_x,
4951                        int32_t                  src_y,
4952                        int32_t                  mask_x,
4953                        int32_t                  mask_y,
4954                        int32_t                  dest_x,
4955                        int32_t                  dest_y,
4956                        int32_t                  width,
4957                        int32_t                  height)
4958 {
4959     uint8_t     *dst_line, *dst;
4960     uint8_t     *src_line, *src;
4961     int src_stride, dst_stride;
4962     uint16_t w;
4963     uint32_t s, d;
4964
4965     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4966     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4967
4968     PIXMAN_IMAGE_GET_LINE (
4969         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4970     PIXMAN_IMAGE_GET_LINE (
4971         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4972
4973     while (height--)
4974     {
4975         dst = dst_line;
4976         dst_line += dst_stride;
4977         src = src_line;
4978         src_line += src_stride;
4979         w = width;
4980
4981         /* call prefetch hint to optimize cache load */
4982         cache_prefetch ((__m128i*)src);
4983         cache_prefetch ((__m128i*)dst);
4984
4985         while (w && ((unsigned long)dst & 15))
4986         {
4987             s = (uint32_t) *src++;
4988             d = (uint32_t) *dst;
4989
4990             *dst++ = (uint8_t) pack_1x64_32 (
4991                 pix_multiply_1x64 (
4992                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4993             w--;
4994         }
4995
4996         /* call prefetch hint to optimize cache load */
4997         cache_prefetch ((__m128i*)src);
4998         cache_prefetch ((__m128i*)dst);
4999
5000         while (w >= 16)
5001         {
5002             /* fill cache line with next memory */
5003             cache_prefetch_next ((__m128i*)src);
5004             cache_prefetch_next ((__m128i*)dst);
5005
5006             xmm_src = load_128_unaligned ((__m128i*)src);
5007             xmm_dst = load_128_aligned ((__m128i*)dst);
5008
5009             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5010             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5011
5012             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5013                                 &xmm_dst_lo, &xmm_dst_hi,
5014                                 &xmm_dst_lo, &xmm_dst_hi);
5015
5016             save_128_aligned (
5017                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5018
5019             src += 16;
5020             dst += 16;
5021             w -= 16;
5022         }
5023
5024         while (w)
5025         {
5026             s = (uint32_t) *src++;
5027             d = (uint32_t) *dst;
5028
5029             *dst++ = (uint8_t) pack_1x64_32 (
5030                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5031             w--;
5032         }
5033     }
5034
5035     _mm_empty ();
5036 }
5037
5038 /* -------------------------------------------------------------------------
5039  * composite_add_n_8_8
5040  */
5041
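/*
 * ADD of a solid source through an a8 mask onto an a8 destination: the
 * source alpha is scaled by the mask and added to the destination with
 * unsigned saturation.  Roughly, per pixel:
 *
 *     dst[i] = MIN (0xff, dst[i] + srca * mask[i] / 255);
 */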
5042 static void
5043 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5044                           pixman_op_t              op,
5045                           pixman_image_t *         src_image,
5046                           pixman_image_t *         mask_image,
5047                           pixman_image_t *         dst_image,
5048                           int32_t                  src_x,
5049                           int32_t                  src_y,
5050                           int32_t                  mask_x,
5051                           int32_t                  mask_y,
5052                           int32_t                  dest_x,
5053                           int32_t                  dest_y,
5054                           int32_t                  width,
5055                           int32_t                  height)
5056 {
5057     uint8_t     *dst_line, *dst;
5058     uint8_t     *mask_line, *mask;
5059     int dst_stride, mask_stride;
5060     uint16_t w;
5061     uint32_t src;
5062     uint8_t sa;
5063     uint32_t m, d;
5064
5065     __m128i xmm_alpha;
5066     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5067     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5068
5069     PIXMAN_IMAGE_GET_LINE (
5070         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5071     PIXMAN_IMAGE_GET_LINE (
5072         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5073
5074     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5075
5076     sa = src >> 24;
5077
5078     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5079
5080     while (height--)
5081     {
5082         dst = dst_line;
5083         dst_line += dst_stride;
5084         mask = mask_line;
5085         mask_line += mask_stride;
5086         w = width;
5087
5088         /* call prefetch hint to optimize cache load */
5089         cache_prefetch ((__m128i*)mask);
5090         cache_prefetch ((__m128i*)dst);
5091
5092         while (w && ((unsigned long)dst & 15))
5093         {
5094             m = (uint32_t) *mask++;
5095             d = (uint32_t) *dst;
5096
5097             *dst++ = (uint8_t) pack_1x64_32 (
5098                 _mm_adds_pu16 (
5099                     pix_multiply_1x64 (
5100                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5101                     unpack_32_1x64 (d)));
5102             w--;
5103         }
5104
5105         /* call prefetch hint to optimize cache load */
5106         cache_prefetch ((__m128i*)mask);
5107         cache_prefetch ((__m128i*)dst);
5108
5109         while (w >= 16)
5110         {
5111             /* fill cache line with next memory */
5112             cache_prefetch_next ((__m128i*)mask);
5113             cache_prefetch_next ((__m128i*)dst);
5114
5115             xmm_mask = load_128_unaligned ((__m128i*)mask);
5116             xmm_dst = load_128_aligned ((__m128i*)dst);
5117
5118             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5119             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5120
5121             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5122                                 &xmm_mask_lo, &xmm_mask_hi,
5123                                 &xmm_mask_lo, &xmm_mask_hi);
5124
5125             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5126             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5127
5128             save_128_aligned (
5129                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5130
5131             mask += 16;
5132             dst += 16;
5133             w -= 16;
5134         }
5135
5136         while (w)
5137         {
5138             m = (uint32_t) *mask++;
5139             d = (uint32_t) *dst;
5140
5141             *dst++ = (uint8_t) pack_1x64_32 (
5142                 _mm_adds_pu16 (
5143                     pix_multiply_1x64 (
5144                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5145                     unpack_32_1x64 (d)));
5146
5147             w--;
5148         }
5149     }
5150
5151     _mm_empty ();
5152 }
5153
5154 /* ----------------------------------------------------------------------
5155  * composite_add_8000_8000
5156  */
5157
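/*
 * ADD between two a8 surfaces: a saturating byte-wise add.  The unaligned
 * head and tail of each scanline are handled one byte at a time with the
 * branchless saturation below; the 4-byte-aligned remainder goes through
 * core_combine_add_u_sse2 (), four bytes per uint32_t.
 */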
5158 static void
5159 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5160                               pixman_op_t              op,
5161                               pixman_image_t *         src_image,
5162                               pixman_image_t *         mask_image,
5163                               pixman_image_t *         dst_image,
5164                               int32_t                  src_x,
5165                               int32_t                  src_y,
5166                               int32_t                  mask_x,
5167                               int32_t                  mask_y,
5168                               int32_t                  dest_x,
5169                               int32_t                  dest_y,
5170                               int32_t                  width,
5171                               int32_t                  height)
5172 {
5173     uint8_t     *dst_line, *dst;
5174     uint8_t     *src_line, *src;
5175     int dst_stride, src_stride;
5176     uint16_t w;
5177     uint16_t t;
5178
5179     PIXMAN_IMAGE_GET_LINE (
5180         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5181     PIXMAN_IMAGE_GET_LINE (
5182         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5183
5184     while (height--)
5185     {
5186         dst = dst_line;
5187         src = src_line;
5188
5189         /* call prefetch hint to optimize cache load */
5190         cache_prefetch ((__m128i*)src);
5191         cache_prefetch ((__m128i*)dst);
5192
5193         dst_line += dst_stride;
5194         src_line += src_stride;
5195         w = width;
5196
5197         /* Small head */
5198         while (w && (unsigned long)dst & 3)
5199         {
5200             t = (*dst) + (*src++);
5201             *dst++ = t | (0 - (t >> 8));    /* saturate to 0xff on overflow */
5202             w--;
5203         }
5204
5205         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5206
5207         /* Small tail */
5208         dst += w & 0xfffc;
5209         src += w & 0xfffc;
5210
5211         w &= 3;
5212
5213         while (w)
5214         {
5215             t = (*dst) + (*src++);
5216             *dst++ = t | (0 - (t >> 8));
5217             w--;
5218         }
5219     }
5220
5221     _mm_empty ();
5222 }
5223
5224 /* ---------------------------------------------------------------------
5225  * composite_add_8888_8888
5226  */
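/*
 * ADD between two 8888 surfaces: whole scanlines are handed to
 * core_combine_add_u_sse2 () for a saturating per-byte add.
 */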
5227 static void
5228 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5229                               pixman_op_t              op,
5230                               pixman_image_t *         src_image,
5231                               pixman_image_t *         mask_image,
5232                               pixman_image_t *         dst_image,
5233                               int32_t                  src_x,
5234                               int32_t                  src_y,
5235                               int32_t                  mask_x,
5236                               int32_t                  mask_y,
5237                               int32_t                  dest_x,
5238                               int32_t                  dest_y,
5239                               int32_t                  width,
5240                               int32_t                  height)
5241 {
5242     uint32_t    *dst_line, *dst;
5243     uint32_t    *src_line, *src;
5244     int dst_stride, src_stride;
5245
5246     PIXMAN_IMAGE_GET_LINE (
5247         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5248     PIXMAN_IMAGE_GET_LINE (
5249         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5250
5251     while (height--)
5252     {
5253         dst = dst_line;
5254         dst_line += dst_stride;
5255         src = src_line;
5256         src_line += src_stride;
5257
5258         core_combine_add_u_sse2 (dst, src, NULL, width);
5259     }
5260
5261     _mm_empty ();
5262 }
5263
5264 /* ---------------------------------------------------------------------------
5265  * sse2_composite_copy_area
5266  */
5267
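/*
 * Plain copy between two images of equal depth (16 or 32 bpp only).
 * pixman_blt_sse2 () aligns the destination, copies 64 bytes per
 * iteration with unaligned loads and aligned stores, and returns FALSE
 * for anything it cannot handle so callers can fall back.
 */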
5268 static pixman_bool_t
5269 pixman_blt_sse2 (uint32_t *src_bits,
5270                  uint32_t *dst_bits,
5271                  int       src_stride,
5272                  int       dst_stride,
5273                  int       src_bpp,
5274                  int       dst_bpp,
5275                  int       src_x,
5276                  int       src_y,
5277                  int       dst_x,
5278                  int       dst_y,
5279                  int       width,
5280                  int       height)
5281 {
5282     uint8_t *   src_bytes;
5283     uint8_t *   dst_bytes;
5284     int byte_width;
5285
5286     if (src_bpp != dst_bpp)
5287         return FALSE;
5288
5289     if (src_bpp == 16)
5290     {
5291         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5292         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5293         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5294         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5295         byte_width = 2 * width;
5296         src_stride *= 2;
5297         dst_stride *= 2;
5298     }
5299     else if (src_bpp == 32)
5300     {
5301         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5302         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5303         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5304         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5305         byte_width = 4 * width;
5306         src_stride *= 4;
5307         dst_stride *= 4;
5308     }
5309     else
5310     {
5311         return FALSE;
5312     }
5313
5314     cache_prefetch ((__m128i*)src_bytes);
5315     cache_prefetch ((__m128i*)dst_bytes);
5316
5317     while (height--)
5318     {
5319         int w;
5320         uint8_t *s = src_bytes;
5321         uint8_t *d = dst_bytes;
5322         src_bytes += src_stride;
5323         dst_bytes += dst_stride;
5324         w = byte_width;
5325
5326         cache_prefetch_next ((__m128i*)s);
5327         cache_prefetch_next ((__m128i*)d);
5328
5329         while (w >= 2 && ((unsigned long)d & 3))
5330         {
5331             *(uint16_t *)d = *(uint16_t *)s;
5332             w -= 2;
5333             s += 2;
5334             d += 2;
5335         }
5336
5337         while (w >= 4 && ((unsigned long)d & 15))
5338         {
5339             *(uint32_t *)d = *(uint32_t *)s;
5340
5341             w -= 4;
5342             s += 4;
5343             d += 4;
5344         }
5345
5346         cache_prefetch_next ((__m128i*)s);
5347         cache_prefetch_next ((__m128i*)d);
5348
5349         while (w >= 64)
5350         {
5351             __m128i xmm0, xmm1, xmm2, xmm3;
5352
5353             /* 128 bytes ahead */
5354             cache_prefetch (((__m128i*)s) + 8);
5355             cache_prefetch (((__m128i*)d) + 8);
5356
5357             xmm0 = load_128_unaligned ((__m128i*)(s));
5358             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5359             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5360             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5361
5362             save_128_aligned ((__m128i*)(d),    xmm0);
5363             save_128_aligned ((__m128i*)(d + 16), xmm1);
5364             save_128_aligned ((__m128i*)(d + 32), xmm2);
5365             save_128_aligned ((__m128i*)(d + 48), xmm3);
5366
5367             s += 64;
5368             d += 64;
5369             w -= 64;
5370         }
5371
5372         cache_prefetch_next ((__m128i*)s);
5373         cache_prefetch_next ((__m128i*)d);
5374
5375         while (w >= 16)
5376         {
5377             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5378
5379             w -= 16;
5380             d += 16;
5381             s += 16;
5382         }
5383
5384         cache_prefetch_next ((__m128i*)s);
5385         cache_prefetch_next ((__m128i*)d);
5386
5387         while (w >= 4)
5388         {
5389             *(uint32_t *)d = *(uint32_t *)s;
5390
5391             w -= 4;
5392             s += 4;
5393             d += 4;
5394         }
5395
5396         if (w >= 2)
5397         {
5398             *(uint16_t *)d = *(uint16_t *)s;
5399             w -= 2;
5400             s += 2;
5401             d += 2;
5402         }
5403     }
5404
5405     _mm_empty ();
5406
5407     return TRUE;
5408 }
5409
5410 static void
5411 sse2_composite_copy_area (pixman_implementation_t *imp,
5412                           pixman_op_t              op,
5413                           pixman_image_t *         src_image,
5414                           pixman_image_t *         mask_image,
5415                           pixman_image_t *         dst_image,
5416                           int32_t                  src_x,
5417                           int32_t                  src_y,
5418                           int32_t                  mask_x,
5419                           int32_t                  mask_y,
5420                           int32_t                  dest_x,
5421                           int32_t                  dest_y,
5422                           int32_t                  width,
5423                           int32_t                  height)
5424 {
5425     pixman_blt_sse2 (src_image->bits.bits,
5426                      dst_image->bits.bits,
5427                      src_image->bits.rowstride,
5428                      dst_image->bits.rowstride,
5429                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5430                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5431                      src_x, src_y, dest_x, dest_y, width, height);
5432 }
5433
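/* ---------------------------------------------------------------------------
 * composite_over_x888_8_8888
 *
 * OVER of an x888 source (made opaque by OR-ing in 0xff000000) through an
 * a8 mask onto an 8888 destination.  A fully opaque mask stores the source
 * directly; anything else goes through in_over ().
 */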
5434 static void
5435 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5436                                  pixman_op_t              op,
5437                                  pixman_image_t *         src_image,
5438                                  pixman_image_t *         mask_image,
5439                                  pixman_image_t *         dst_image,
5440                                  int32_t                  src_x,
5441                                  int32_t                  src_y,
5442                                  int32_t                  mask_x,
5443                                  int32_t                  mask_y,
5444                                  int32_t                  dest_x,
5445                                  int32_t                  dest_y,
5446                                  int32_t                  width,
5447                                  int32_t                  height)
5448 {
5449     uint32_t    *src, *src_line, s;
5450     uint32_t    *dst, *dst_line, d;
5451     uint8_t         *mask, *mask_line;
5452     uint32_t m;
5453     int src_stride, mask_stride, dst_stride;
5454     uint16_t w;
5455
5456     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5457     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5458     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5459
5460     PIXMAN_IMAGE_GET_LINE (
5461         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5462     PIXMAN_IMAGE_GET_LINE (
5463         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5464     PIXMAN_IMAGE_GET_LINE (
5465         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5466
5467     while (height--)
5468     {
5469         src = src_line;
5470         src_line += src_stride;
5471         dst = dst_line;
5472         dst_line += dst_stride;
5473         mask = mask_line;
5474         mask_line += mask_stride;
5475
5476         w = width;
5477
5478         /* call prefetch hint to optimize cache load */
5479         cache_prefetch ((__m128i*)src);
5480         cache_prefetch ((__m128i*)dst);
5481         cache_prefetch ((__m128i*)mask);
5482
5483         while (w && (unsigned long)dst & 15)
5484         {
5485             s = 0xff000000 | *src++;
5486             m = (uint32_t) *mask++;
5487             d = *dst;
5488
5489             __m64 ms = unpack_32_1x64 (s);
5490
5491             if (m != 0xff)
5492             {
5493                 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5494                 __m64 md = unpack_32_1x64 (d);
5495
5496                 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5497             }
5498
5499             *dst++ = pack_1x64_32 (ms);
5500             w--;
5501         }
5502
5503         /* call prefetch hint to optimize cache load */
5504         cache_prefetch ((__m128i*)src);
5505         cache_prefetch ((__m128i*)dst);
5506         cache_prefetch ((__m128i*)mask);
5507
5508         while (w >= 4)
5509         {
5510             /* fill cache line with next memory */
5511             cache_prefetch_next ((__m128i*)src);
5512             cache_prefetch_next ((__m128i*)dst);
5513             cache_prefetch_next ((__m128i*)mask);
5514
5515             m = *(uint32_t*) mask;
5516             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5517
5518             if (m == 0xffffffff)
5519             {
5520                 save_128_aligned ((__m128i*)dst, xmm_src);
5521             }
5522             else
5523             {
5524                 xmm_dst = load_128_aligned ((__m128i*)dst);
5525
5526                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128 ());
5527
5528                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5529                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5530                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5531
5532                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5533
5534                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5535
5536                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5537             }
5538
5539             src += 4;
5540             dst += 4;
5541             mask += 4;
5542             w -= 4;
5543         }
5544
5545         while (w)
5546         {
5547             m = (uint32_t) *mask++;
5548
5549             if (m)
5550             {
5551                 s = 0xff000000 | *src;
5552
5553                 if (m == 0xff)
5554                 {
5555                     *dst = s;
5556                 }
5557                 else
5558                 {
5559                     __m64 ma, md, ms;
5560
5561                     d = *dst;
5562
5563                     ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5564                     md = unpack_32_1x64 (d);
5565                     ms = unpack_32_1x64 (s);
5566
5567                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5568                 }
5569
5570             }
5571
5572             src++;
5573             dst++;
5574             w--;
5575         }
5576     }
5577
5578     _mm_empty ();
5579 }
5580
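/*
 * Fast path table: operator, source format, mask format, destination
 * format, the function handling that combination, and a flags word.
 * PIXMAN_solid stands for a solid source and PIXMAN_null for the absence
 * of a mask.
 */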
5581 static const pixman_fast_path_t sse2_fast_paths[] =
5582 {
5583     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   sse2_composite_over_n_8_0565,       0 },
5584     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   sse2_composite_over_n_8_0565,       0 },
5585     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_n_8888,         0 },
5586     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_n_8888,         0 },
5587     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_n_0565,         0 },
5588     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888,      0 },
5589     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888,      0 },
5590     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888,      0 },
5591     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888,      0 },
5592     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_8888_0565,      0 },
5593     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_over_8888_0565,      0 },
5594     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888,       0 },
5595     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888,       0 },
5596     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888,       0 },
5597     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888,       0 },
5598     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
5599     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
5600     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
5601     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
5602     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5603     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5604     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5605     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5606     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5607     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5608     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5609     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5610     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5611     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5612     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5613     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5614     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5615     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5616     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5617     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5618     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5619     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5620     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5621     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5622     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5623     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5624     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5625     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5626     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5627     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5628     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5629     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5630
5631     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca,  NEED_COMPONENT_ALPHA },
5632     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       sse2_composite_add_8000_8000,       0 },
5633     { PIXMAN_OP_ADD,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888,       0 },
5634     { PIXMAN_OP_ADD,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888,       0 },
5635     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       sse2_composite_add_n_8_8,           0 },
5636
5637     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888,        0 },
5638     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888,        0 },
5639     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888,        0 },
5640     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888,        0 },
5641     { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_copy_area,           0 },
5642     { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_copy_area,           0 },
5643     { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5644     { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5645     { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5646     { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5647     { PIXMAN_OP_SRC, PIXMAN_r5g6b5,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_copy_area,           0 },
5648     { PIXMAN_OP_SRC, PIXMAN_b5g6r5,    PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_copy_area,           0 },
5649
5650     { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       sse2_composite_in_8_8,              0 },
5651     { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       sse2_composite_in_n_8_8,            0 },
5652
5653     { PIXMAN_OP_NONE },
5654 };
5655
5656 /*
5657  * Work around GCC bug causing crashes in Mozilla with SSE2
5658  *
5659  * When using -msse, gcc generates movdqa instructions assuming that
5660  * the stack is 16 byte aligned. Unfortunately some applications, such
5661  * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
5662  * causes the movdqa instructions to fail.
5663  *
5664  * The __force_align_arg_pointer__ makes gcc generate a prologue that
5665  * realigns the stack pointer to 16 bytes.
5666  *
5667  * On x86-64 this is not necessary because the standard ABI already
5668  * calls for a 16 byte aligned stack.
5669  *
5670  * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
5671  */
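/*
 * In effect the attribute makes gcc realign the stack on entry, roughly
 * equivalent to an "andl $-16, %esp" in the prologue (sketch only; the
 * exact sequence depends on the gcc version).
 */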
5672 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5673 __attribute__((__force_align_arg_pointer__))
5674 #endif
5675 static void
5676 sse2_composite (pixman_implementation_t *imp,
5677                 pixman_op_t              op,
5678                 pixman_image_t *         src,
5679                 pixman_image_t *         mask,
5680                 pixman_image_t *         dest,
5681                 int32_t                  src_x,
5682                 int32_t                  src_y,
5683                 int32_t                  mask_x,
5684                 int32_t                  mask_y,
5685                 int32_t                  dest_x,
5686                 int32_t                  dest_y,
5687                 int32_t                  width,
5688                 int32_t                  height)
5689 {
5690     if (_pixman_run_fast_path (sse2_fast_paths, imp,
5691                                op, src, mask, dest,
5692                                src_x, src_y,
5693                                mask_x, mask_y,
5694                                dest_x, dest_y,
5695                                width, height))
5696     {
5697         return;
5698     }
5699
5700     _pixman_implementation_composite (imp->delegate, op,
5701                                       src, mask, dest,
5702                                       src_x, src_y,
5703                                       mask_x, mask_y,
5704                                       dest_x, dest_y,
5705                                       width, height);
5706 }
5707
5708 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5709 __attribute__((__force_align_arg_pointer__))
5710 #endif
5711 static pixman_bool_t
5712 sse2_blt (pixman_implementation_t *imp,
5713           uint32_t *               src_bits,
5714           uint32_t *               dst_bits,
5715           int                      src_stride,
5716           int                      dst_stride,
5717           int                      src_bpp,
5718           int                      dst_bpp,
5719           int                      src_x,
5720           int                      src_y,
5721           int                      dst_x,
5722           int                      dst_y,
5723           int                      width,
5724           int                      height)
5725 {
5726     if (!pixman_blt_sse2 (
5727             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5728             src_x, src_y, dst_x, dst_y, width, height))
5729
5730     {
5731         return _pixman_implementation_blt (
5732             imp->delegate,
5733             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5734             src_x, src_y, dst_x, dst_y, width, height);
5735     }
5736
5737     return TRUE;
5738 }
5739
5740 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5741 __attribute__((__force_align_arg_pointer__))
5742 #endif
5743 static pixman_bool_t
5744 sse2_fill (pixman_implementation_t *imp,
5745            uint32_t *               bits,
5746            int                      stride,
5747            int                      bpp,
5748            int                      x,
5749            int                      y,
5750            int                      width,
5751            int                      height,
5752            uint32_t xor)
5753 {
5754     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5755     {
5756         return _pixman_implementation_fill (
5757             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5758     }
5759
5760     return TRUE;
5761 }
5762
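/*
 * Create the SSE2 implementation with the MMX implementation as its
 * delegate: anything the SSE2 fast paths and combiners do not handle
 * falls through to the delegate chain.
 */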
5763 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5764 __attribute__((__force_align_arg_pointer__))
5765 #endif
5766 pixman_implementation_t *
5767 _pixman_implementation_create_sse2 (void)
5768 {
5769     pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5770     pixman_implementation_t *imp = _pixman_implementation_create (mmx);
5771
5772     /* SSE2 constants */
5773     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5774     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5775     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5776     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5777     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5778     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5779     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5780     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5781     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
5782     mask_0080 = create_mask_16_128 (0x0080);
5783     mask_00ff = create_mask_16_128 (0x00ff);
5784     mask_0101 = create_mask_16_128 (0x0101);
5785     mask_ffff = create_mask_16_128 (0xffff);
5786     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5787     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5788
5789     /* MMX constants */
5790     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5791     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5792
5793     mask_x0080 = create_mask_16_64 (0x0080);
5794     mask_x00ff = create_mask_16_64 (0x00ff);
5795     mask_x0101 = create_mask_16_64 (0x0101);
5796     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5797
5798     _mm_empty ();
5799
5800     /* Set up function pointers */
5801
5802     /* SSE code patch for fbcompose.c */
5803     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5804     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5805     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5806     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5807     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5808     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5809     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5810     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5811     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5812     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5813
5814     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5815
5816     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5817     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5818     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5819     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5820     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5821     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5822     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5823     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5824     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5825     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5826     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5827
5828     imp->composite = sse2_composite;
5829     imp->blt = sse2_blt;
5830     imp->fill = sse2_fill;
5831
5832     return imp;
5833 }
5834
5835 #endif /* USE_SSE2 */