[sse2] Add sse2_composite_src_x888_8888()
pixman/pixman-sse2.c (from profile/ivi/pixman.git)
1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <mmintrin.h>
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41  * the pixman-x64-mmx-emulation.h file contains
42  * implementations of those MMX intrinsics that
43  * are used in the SSE2 implementation.
44  */
45 #   include "pixman-x64-mmx-emulation.h"
46 #endif
47
48 #ifdef USE_SSE2
49
50 /* --------------------------------------------------------------------
51  * Locals
52  */
53
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
58
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
61
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
68
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
75
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
78
79 /* ----------------------------------------------------------------------
80  * SSE2 Inlines
81  */
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
84 {
85     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
86 }
87
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
90 {
91     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
93 }
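/* Most of the arithmetic below works on "unpacked" pixels: each 8-bit
 * channel is widened to a 16-bit lane by interleaving with zero, so one
 * 128-bit register holds two unpacked a8r8g8b8 pixels and the product
 * of two 8-bit values fits in a lane without overflow.
 */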
94
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
97 {
98     __m128i r, g, b, rb, t;
99
100     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
103
104     rb = _mm_or_si128 (r, b);
105     t  = _mm_and_si128 (rb, mask_565_fix_rb);
106     t  = _mm_srli_epi32 (t, 5);
107     rb = _mm_or_si128 (rb, t);
108
109     t  = _mm_and_si128 (g, mask_565_fix_g);
110     t  = _mm_srli_epi32 (t, 6);
111     g  = _mm_or_si128 (g, t);
112
113     return _mm_or_si128 (rb, g);
114 }
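/* After the three shifts above, each 5-bit (red/blue) or 6-bit (green)
 * field sits in the top bits of its 8-bit slot with zeros below it.
 * OR-ing in a copy shifted right by 5 (or 6 for green) replicates the
 * top bits into those zeros, so full intensity maps to 0xff instead of
 * 0xf8.  Roughly the scalar equivalent, per channel:
 *
 *     r8 = (r5 << 3) | (r5 >> 2);
 *     g8 = (g6 << 2) | (g6 >> 4);
 *     b8 = (b5 << 3) | (b5 >> 2);
 */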
115
116 static force_inline void
117 unpack_565_128_4x128 (__m128i  data,
118                       __m128i* data0,
119                       __m128i* data1,
120                       __m128i* data2,
121                       __m128i* data3)
122 {
123     __m128i lo, hi;
124
125     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
127
128     lo = unpack_565_to_8888 (lo);
129     hi = unpack_565_to_8888 (hi);
130
131     unpack_128_2x128 (lo, data0, data1);
132     unpack_128_2x128 (hi, data2, data3);
133 }
134
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
137 {
138     return (uint16_t) (((pixel >> 8) & 0xf800) |
139                        ((pixel >> 5) & 0x07e0) |
140                        ((pixel >> 3) & 0x001f));
141 }
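/* The reverse direction just keeps the most significant bits of each
 * channel: bits 19-23 of the x8r8g8b8 pixel become the five red bits,
 * bits 10-15 the six green bits and bits 3-7 the five blue bits.
 */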
142
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
145 {
146     return _mm_packus_epi16 (lo, hi);
147 }
148
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
151 {
152     __m128i data;
153     __m128i r, g1, g2, b;
154
155     data = pack_2x128_128 (lo, hi);
156
157     r  = _mm_and_si128 (data, mask_565_r);
158     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
162     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163 }
164
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167 {
168     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169                              pack_565_2x128_128 (*xmm2, *xmm3));
170 }
171
172 static force_inline int
173 is_opaque (__m128i x)
174 {
175     __m128i ffs = _mm_cmpeq_epi8 (x, x);
176
177     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178 }
179
180 static force_inline int
181 is_zero (__m128i x)
182 {
183     return _mm_movemask_epi8 (
184         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185 }
186
187 static force_inline int
188 is_transparent (__m128i x)
189 {
190     return (_mm_movemask_epi8 (
191                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192 }
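/* _mm_movemask_epi8 gathers the top bit of each of the 16 bytes into a
 * 16-bit value.  With a8r8g8b8 pixels stored as little-endian uint32_t,
 * the alpha bytes are bytes 3, 7, 11 and 15, so masking with 0x8888
 * looks at the four alphas only: is_opaque asks whether they are all
 * 0xff, is_transparent whether they are all 0x00, and is_zero whether
 * every byte of the register is zero.
 */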
193
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
196 {
197     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198 }
199
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
202 {
203     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204                                                      _MM_SHUFFLE (3, 3, 3, 3)),
205                                 _MM_SHUFFLE (3, 3, 3, 3));
206 }
207
208 static force_inline void
209 expand_alpha_2x128 (__m128i  data_lo,
210                     __m128i  data_hi,
211                     __m128i* alpha_lo,
212                     __m128i* alpha_hi)
213 {
214     __m128i lo, hi;
215
216     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
219     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221 }
222
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i  data_lo,
225                         __m128i  data_hi,
226                         __m128i* alpha_lo,
227                         __m128i* alpha_hi)
228 {
229     __m128i lo, hi;
230
231     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235 }
236
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
239                     __m128i* data_hi,
240                     __m128i* alpha_lo,
241                     __m128i* alpha_hi,
242                     __m128i* ret_lo,
243                     __m128i* ret_hi)
244 {
245     __m128i lo, hi;
246
247     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249     lo = _mm_adds_epu16 (lo, mask_0080);
250     hi = _mm_adds_epu16 (hi, mask_0080);
251     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253 }
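/* pix_multiply_2x128 computes an exact, rounded x * a / 255 for every
 * 16-bit lane using only 16-bit operations: t = x * a + 0x80, then the
 * high half of t * 0x0101 equals (t + (t >> 8)) >> 8.  The scalar form
 * of the same trick (cf. MUL_UN8 in pixman-combine32.h):
 *
 *     t = x * a + 0x80;
 *     result = (t + (t >> 8)) >> 8;
 */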
254
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
257                         __m128i* src_hi,
258                         __m128i* alpha_dst_lo,
259                         __m128i* alpha_dst_hi,
260                         __m128i* dst_lo,
261                         __m128i* dst_hi,
262                         __m128i* alpha_src_lo,
263                         __m128i* alpha_src_hi,
264                         __m128i* ret_lo,
265                         __m128i* ret_hi)
266 {
267     __m128i t1_lo, t1_hi;
268     __m128i t2_lo, t2_hi;
269
270     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272
273     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275 }
276
277 static force_inline void
278 negate_2x128 (__m128i  data_lo,
279               __m128i  data_hi,
280               __m128i* neg_lo,
281               __m128i* neg_hi)
282 {
283     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285 }
286
287 static force_inline void
288 invert_colors_2x128 (__m128i  data_lo,
289                      __m128i  data_hi,
290                      __m128i* inv_lo,
291                      __m128i* inv_hi)
292 {
293     __m128i lo, hi;
294
295     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299 }
300
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
303             __m128i* src_hi,
304             __m128i* alpha_lo,
305             __m128i* alpha_hi,
306             __m128i* dst_lo,
307             __m128i* dst_hi)
308 {
309     __m128i t1, t2;
310
311     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
312
313     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
314
315     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
317 }
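/* over_2x128 is the Porter-Duff OVER operator on premultiplied pixels:
 * dst = src + dst * (255 - alpha) / 255 per channel, built from negate,
 * pix_multiply and a saturating add.
 */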
318
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i  src_lo,
321                         __m128i  src_hi,
322                         __m128i* dst_lo,
323                         __m128i* dst_hi)
324 {
325     __m128i lo, hi;
326     __m128i alpha_lo, alpha_hi;
327
328     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
329
330     lo = _mm_or_si128 (alpha_lo, mask_alpha);
331     hi = _mm_or_si128 (alpha_hi, mask_alpha);
332
333     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
334
335     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
336
337     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
338 }
339
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
342                __m128i* src_hi,
343                __m128i* alpha_lo,
344                __m128i* alpha_hi,
345                __m128i* mask_lo,
346                __m128i* mask_hi,
347                __m128i* dst_lo,
348                __m128i* dst_hi)
349 {
350     __m128i s_lo, s_hi;
351     __m128i a_lo, a_hi;
352
353     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355
356     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357 }
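/* in_over_2x128 implements (src IN mask) OVER dst for component-alpha
 * masks: both the source and its expanded alpha are multiplied by the
 * mask before the result is composited over the destination.
 */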
358
359 static force_inline void
360 cache_prefetch (__m128i* addr)
361 {
362     _mm_prefetch ((void const*)addr, _MM_HINT_T0);
363 }
364
365 static force_inline void
366 cache_prefetch_next (__m128i* addr)
367 {
368     _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
369 }
370
371 /* Prefetching NULL is very slow on some systems, so don't do that. */
372
373 static force_inline void
374 maybe_prefetch (__m128i* addr)
375 {
376     if (addr)
377         cache_prefetch (addr);
378 }
379
380 static force_inline void
381 maybe_prefetch_next (__m128i* addr)
382 {
383     if (addr)
384         cache_prefetch_next (addr);
385 }
386
387 /* load 4 pixels from a 16-byte-aligned address */
388 static force_inline __m128i
389 load_128_aligned (__m128i* src)
390 {
391     return _mm_load_si128 (src);
392 }
393
394 /* load 4 pixels from an unaligned address */
395 static force_inline __m128i
396 load_128_unaligned (const __m128i* src)
397 {
398     return _mm_loadu_si128 (src);
399 }
400
401 /* save 4 pixels to a 16-byte-aligned address with a non-temporal
402  * (write-combining) store
403  */
404 static force_inline void
405 save_128_write_combining (__m128i* dst,
406                           __m128i  data)
407 {
408     _mm_stream_si128 (dst, data);
409 }
410
411 /* save 4 pixels to a 16-byte-aligned address */
412 static force_inline void
413 save_128_aligned (__m128i* dst,
414                   __m128i  data)
415 {
416     _mm_store_si128 (dst, data);
417 }
418
419 /* save 4 pixels to an unaligned address */
420 static force_inline void
421 save_128_unaligned (__m128i* dst,
422                     __m128i  data)
423 {
424     _mm_storeu_si128 (dst, data);
425 }
426
427 /* ------------------------------------------------------------------
428  * MMX inlines
429  */
430
431 static force_inline __m64
432 load_32_1x64 (uint32_t data)
433 {
434     return _mm_cvtsi32_si64 (data);
435 }
436
437 static force_inline __m64
438 unpack_32_1x64 (uint32_t data)
439 {
440     return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
441 }
442
443 static force_inline __m64
444 expand_alpha_1x64 (__m64 data)
445 {
446     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
447 }
448
449 static force_inline __m64
450 expand_alpha_rev_1x64 (__m64 data)
451 {
452     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
453 }
454
455 static force_inline __m64
456 expand_pixel_8_1x64 (uint8_t data)
457 {
458     return _mm_shuffle_pi16 (
459         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
460 }
461
462 static force_inline __m64
463 pix_multiply_1x64 (__m64 data,
464                    __m64 alpha)
465 {
466     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
467                                           mask_x0080),
468                            mask_x0101);
469 }
470
471 static force_inline __m64
472 pix_add_multiply_1x64 (__m64* src,
473                        __m64* alpha_dst,
474                        __m64* dst,
475                        __m64* alpha_src)
476 {
477     __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
478     __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
479
480     return _mm_adds_pu8 (t1, t2);
481 }
482
483 static force_inline __m64
484 negate_1x64 (__m64 data)
485 {
486     return _mm_xor_si64 (data, mask_x00ff);
487 }
488
489 static force_inline __m64
490 invert_colors_1x64 (__m64 data)
491 {
492     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
493 }
494
495 static force_inline __m64
496 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
497 {
498     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
499 }
500
501 static force_inline __m64
502 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
503 {
504     return over_1x64 (pix_multiply_1x64 (*src, *mask),
505                       pix_multiply_1x64 (*alpha, *mask),
506                       *dst);
507 }
508
509 static force_inline __m64
510 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
511 {
512     __m64 alpha = expand_alpha_1x64 (src);
513
514     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
515                                          _mm_or_si64 (alpha, mask_x_alpha)),
516                       alpha,
517                       dst);
518 }
519
520 static force_inline uint32_t
521 pack_1x64_32 (__m64 data)
522 {
523     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
524 }
525
526 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into
527  *
528  *    00RR00GG00BB
529  *
530  * --- Expanding 565 in the low word ---
531  *
532  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
533  * m = m & (01f0003f001f);
534  * m = m * (008404100840);
535  * m = m >> 8;
536  *
537  * Note the trick here - the top word is shifted by another nibble to
538  * avoid it bumping into the middle word
539  */
540 static force_inline __m64
541 expand565_16_1x64 (uint16_t pixel)
542 {
543     __m64 p;
544     __m64 t1, t2;
545
546     p = _mm_cvtsi32_si64 ((uint32_t) pixel);
547
548     t1 = _mm_slli_si64 (p, 36 - 11);
549     t2 = _mm_slli_si64 (p, 16 - 5);
550
551     p = _mm_or_si64 (t1, p);
552     p = _mm_or_si64 (t2, p);
553     p = _mm_and_si64 (p, mask_x565_rgb);
554     p = _mm_mullo_pi16 (p, mask_x565_unpack);
555
556     return _mm_srli_pi16 (p, 8);
557 }
558
559 /* ----------------------------------------------------------------------------
560  * Compose Core transformations
561  */
562 static force_inline uint32_t
563 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
564 {
565     uint8_t a;
566     __m64 ms;
567
568     a = src >> 24;
569
570     if (a == 0xff)
571     {
572         return src;
573     }
574     else if (src)
575     {
576         ms = unpack_32_1x64 (src);
577         return pack_1x64_32 (
578             over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
579     }
580
581     return dst;
582 }
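/* Per-pixel OVER with the two cheap cases handled up front: a fully
 * opaque source simply replaces the destination and an all-zero source
 * leaves it untouched; only translucent pixels pay for the
 * unpack/blend/pack sequence.
 */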
583
584 static force_inline uint32_t
585 combine1 (const uint32_t *ps, const uint32_t *pm)
586 {
587     uint32_t s = *ps;
588
589     if (pm)
590     {
591         __m64 ms, mm;
592
593         mm = unpack_32_1x64 (*pm);
594         mm = expand_alpha_1x64 (mm);
595
596         ms = unpack_32_1x64 (s);
597         ms = pix_multiply_1x64 (ms, mm);
598
599         s = pack_1x64_32 (ms);
600     }
601
602     return s;
603 }
604
605 static force_inline __m128i
606 combine4 (const __m128i *ps, const __m128i *pm)
607 {
608     __m128i xmm_src_lo, xmm_src_hi;
609     __m128i xmm_msk_lo, xmm_msk_hi;
610     __m128i s;
611
612     if (pm)
613     {
614         xmm_msk_lo = load_128_unaligned (pm);
615
616         if (is_transparent (xmm_msk_lo))
617             return _mm_setzero_si128 ();
618     }
619
620     s = load_128_unaligned (ps);
621
622     if (pm)
623     {
624         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
625         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
626
627         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
628
629         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
630                             &xmm_msk_lo, &xmm_msk_hi,
631                             &xmm_src_lo, &xmm_src_hi);
632
633         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
634     }
635
636     return s;
637 }
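/* combine1 and combine4 provide the shared mask handling for the
 * unified (_u) combiners below: when a mask pointer is given, the
 * source is multiplied by the mask's expanded alpha (src IN mask), one
 * pixel at a time with MMX or four at a time with SSE2.  combine4 also
 * returns zero outright when all four mask alphas are zero, without
 * touching the source at all.
 */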
638
639 static force_inline void
640 core_combine_over_u_sse2 (uint32_t*       pd,
641                           const uint32_t* ps,
642                           const uint32_t* pm,
643                           int             w)
644 {
645     uint32_t s, d;
646
647     __m128i xmm_dst_lo, xmm_dst_hi;
648     __m128i xmm_src_lo, xmm_src_hi;
649     __m128i xmm_alpha_lo, xmm_alpha_hi;
650
651     /* call prefetch hint to optimize cache load*/
652     cache_prefetch ((__m128i*)ps);
653     cache_prefetch ((__m128i*)pd);
654     maybe_prefetch ((__m128i*)pm);
655
656     /* Align dst on a 16-byte boundary */
657     while (w && ((unsigned long)pd & 15))
658     {
659         d = *pd;
660         s = combine1 (ps, pm);
661
662         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
663         ps++;
664         if (pm)
665             pm++;
666         w--;
667     }
668
669     /* call prefetch hint to optimize cache load*/
670     cache_prefetch ((__m128i*)ps);
671     cache_prefetch ((__m128i*)pd);
672     maybe_prefetch ((__m128i*)pm);
673
674     while (w >= 4)
675     {
676         /* fill cache line with next memory */
677         cache_prefetch_next ((__m128i*)ps);
678         cache_prefetch_next ((__m128i*)pd);
679         maybe_prefetch_next ((__m128i*)pm);
680
681         /* I'm loading unaligned because I'm not sure about
682          * the address alignment.
683          */
684         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
685
686         if (is_opaque (xmm_src_hi))
687         {
688             save_128_aligned ((__m128i*)pd, xmm_src_hi);
689         }
690         else if (!is_zero (xmm_src_hi))
691         {
692             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
693
694             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
695             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
696
697             expand_alpha_2x128 (
698                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
699
700             over_2x128 (&xmm_src_lo, &xmm_src_hi,
701                         &xmm_alpha_lo, &xmm_alpha_hi,
702                         &xmm_dst_lo, &xmm_dst_hi);
703
704             /* rebuild the 4 pixel data and save */
705             save_128_aligned ((__m128i*)pd,
706                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
707         }
708
709         w -= 4;
710         ps += 4;
711         pd += 4;
712         if (pm)
713             pm += 4;
714     }
715
716     while (w)
717     {
718         d = *pd;
719         s = combine1 (ps, pm);
720
721         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
722         ps++;
723         if (pm)
724             pm++;
725
726         w--;
727     }
728 }
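/* All of the core_combine_*_sse2 routines follow this same shape: a
 * scalar head loop that advances pd to a 16-byte boundary, a main loop
 * that handles four pixels per iteration with aligned stores while
 * prefetching upcoming cache lines, and a scalar tail loop for the last
 * 0-3 pixels.  The OVER loop above additionally skips whole groups that
 * are fully opaque (plain copy) or completely zero (no-op).
 */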
729
730 static force_inline void
731 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
732                                   const uint32_t* ps,
733                                   const uint32_t* pm,
734                                   int             w)
735 {
736     uint32_t s, d;
737
738     __m128i xmm_dst_lo, xmm_dst_hi;
739     __m128i xmm_src_lo, xmm_src_hi;
740     __m128i xmm_alpha_lo, xmm_alpha_hi;
741
742     /* call prefetch hint to optimize cache load*/
743     cache_prefetch ((__m128i*)ps);
744     cache_prefetch ((__m128i*)pd);
745     maybe_prefetch ((__m128i*)pm);
746
747     /* Align dst on a 16-byte boundary */
748     while (w &&
749            ((unsigned long)pd & 15))
750     {
751         d = *pd;
752         s = combine1 (ps, pm);
753
754         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
755         w--;
756         ps++;
757         if (pm)
758             pm++;
759     }
760
761     /* call prefetch hint to optimize cache load*/
762     cache_prefetch ((__m128i*)ps);
763     cache_prefetch ((__m128i*)pd);
764     maybe_prefetch ((__m128i*)pm);
765
766     while (w >= 4)
767     {
768         /* fill cache line with next memory */
769         cache_prefetch_next ((__m128i*)ps);
770         cache_prefetch_next ((__m128i*)pd);
771         maybe_prefetch_next ((__m128i*)pm);
772
773         /* I'm loading unaligned because I'm not sure
774          * about the address alignment.
775          */
776         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
777         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
778
779         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
780         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
781
782         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
783                             &xmm_alpha_lo, &xmm_alpha_hi);
784
785         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
786                     &xmm_alpha_lo, &xmm_alpha_hi,
787                     &xmm_src_lo, &xmm_src_hi);
788
789         /* rebuild the 4 pixel data and save */
790         save_128_aligned ((__m128i*)pd,
791                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
792
793         w -= 4;
794         ps += 4;
795         pd += 4;
796
797         if (pm)
798             pm += 4;
799     }
800
801     while (w)
802     {
803         d = *pd;
804         s = combine1 (ps, pm);
805
806         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
807         ps++;
808         w--;
809         if (pm)
810             pm++;
811     }
812 }
813
814 static force_inline uint32_t
815 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
816 {
817     uint32_t maska = src >> 24;
818
819     if (maska == 0)
820     {
821         return 0;
822     }
823     else if (maska != 0xff)
824     {
825         return pack_1x64_32 (
826             pix_multiply_1x64 (unpack_32_1x64 (dst),
827                                expand_alpha_1x64 (unpack_32_1x64 (src))));
828     }
829
830     return dst;
831 }
832
833 static force_inline void
834 core_combine_in_u_sse2 (uint32_t*       pd,
835                         const uint32_t* ps,
836                         const uint32_t* pm,
837                         int             w)
838 {
839     uint32_t s, d;
840
841     __m128i xmm_src_lo, xmm_src_hi;
842     __m128i xmm_dst_lo, xmm_dst_hi;
843
844     /* call prefetch hint to optimize cache load*/
845     cache_prefetch ((__m128i*)ps);
846     cache_prefetch ((__m128i*)pd);
847     maybe_prefetch ((__m128i*)pm);
848
849     while (w && ((unsigned long) pd & 15))
850     {
851         s = combine1 (ps, pm);
852         d = *pd;
853
854         *pd++ = core_combine_in_u_pixelsse2 (d, s);
855         w--;
856         ps++;
857         if (pm)
858             pm++;
859     }
860
861     /* call prefetch hint to optimize cache load*/
862     cache_prefetch ((__m128i*)ps);
863     cache_prefetch ((__m128i*)pd);
864     maybe_prefetch ((__m128i*)pm);
865
866     while (w >= 4)
867     {
868         /* fill cache line with next memory */
869         cache_prefetch_next ((__m128i*)ps);
870         cache_prefetch_next ((__m128i*)pd);
871         maybe_prefetch_next ((__m128i*)pm);
872
873         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
874         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
875
876         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
877         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
878
879         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
880         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
881                             &xmm_dst_lo, &xmm_dst_hi,
882                             &xmm_dst_lo, &xmm_dst_hi);
883
884         save_128_aligned ((__m128i*)pd,
885                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
886
887         ps += 4;
888         pd += 4;
889         w -= 4;
890         if (pm)
891             pm += 4;
892     }
893
894     while (w)
895     {
896         s = combine1 (ps, pm);
897         d = *pd;
898
899         *pd++ = core_combine_in_u_pixelsse2 (d, s);
900         w--;
901         ps++;
902         if (pm)
903             pm++;
904     }
905 }
906
907 static force_inline void
908 core_combine_reverse_in_u_sse2 (uint32_t*       pd,
909                                 const uint32_t* ps,
910                                 const uint32_t *pm,
911                                 int             w)
912 {
913     uint32_t s, d;
914
915     __m128i xmm_src_lo, xmm_src_hi;
916     __m128i xmm_dst_lo, xmm_dst_hi;
917
918     /* call prefetch hint to optimize cache load*/
919     cache_prefetch ((__m128i*)ps);
920     cache_prefetch ((__m128i*)pd);
921     maybe_prefetch ((__m128i*)pm);
922
923     while (w && ((unsigned long) pd & 15))
924     {
925         s = combine1 (ps, pm);
926         d = *pd;
927
928         *pd++ = core_combine_in_u_pixelsse2 (s, d);
929         ps++;
930         w--;
931         if (pm)
932             pm++;
933     }
934
935     /* call prefetch hint to optimize cache load*/
936     cache_prefetch ((__m128i*)ps);
937     cache_prefetch ((__m128i*)pd);
938     maybe_prefetch ((__m128i*)pm);
939
940     while (w >= 4)
941     {
942         /* fill cache line with next memory */
943         cache_prefetch_next ((__m128i*)ps);
944         cache_prefetch_next ((__m128i*)pd);
945         maybe_prefetch_next ((__m128i*)pm);
946
947         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
948         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
949
950         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
951         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
952
953         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
954         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
955                             &xmm_src_lo, &xmm_src_hi,
956                             &xmm_dst_lo, &xmm_dst_hi);
957
958         save_128_aligned (
959             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
960
961         ps += 4;
962         pd += 4;
963         w -= 4;
964         if (pm)
965             pm += 4;
966     }
967
968     while (w)
969     {
970         s = combine1 (ps, pm);
971         d = *pd;
972
973         *pd++ = core_combine_in_u_pixelsse2 (s, d);
974         w--;
975         ps++;
976         if (pm)
977             pm++;
978     }
979 }
980
981 static force_inline void
982 core_combine_reverse_out_u_sse2 (uint32_t*       pd,
983                                  const uint32_t* ps,
984                                  const uint32_t* pm,
985                                  int             w)
986 {
987     /* call prefetch hint to optimize cache load*/
988     cache_prefetch ((__m128i*)ps);
989     cache_prefetch ((__m128i*)pd);
990     maybe_prefetch ((__m128i*)pm);
991
992     while (w && ((unsigned long) pd & 15))
993     {
994         uint32_t s = combine1 (ps, pm);
995         uint32_t d = *pd;
996
997         *pd++ = pack_1x64_32 (
998             pix_multiply_1x64 (
999                 unpack_32_1x64 (d), negate_1x64 (
1000                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1001
1002         if (pm)
1003             pm++;
1004         ps++;
1005         w--;
1006     }
1007
1008     /* call prefetch hint to optimize cache load*/
1009     cache_prefetch ((__m128i*)ps);
1010     cache_prefetch ((__m128i*)pd);
1011     maybe_prefetch ((__m128i*)pm);
1012
1013     while (w >= 4)
1014     {
1015         __m128i xmm_src_lo, xmm_src_hi;
1016         __m128i xmm_dst_lo, xmm_dst_hi;
1017
1018         /* fill cache line with next memory */
1019         cache_prefetch_next ((__m128i*)ps);
1020         cache_prefetch_next ((__m128i*)pd);
1021         maybe_prefetch_next ((__m128i*)pm);
1022
1023         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1024         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1025
1026         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1027         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1028
1029         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1030         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1031
1032         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1033                             &xmm_src_lo, &xmm_src_hi,
1034                             &xmm_dst_lo, &xmm_dst_hi);
1035
1036         save_128_aligned (
1037             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1038
1039         ps += 4;
1040         pd += 4;
1041         if (pm)
1042             pm += 4;
1043
1044         w -= 4;
1045     }
1046
1047     while (w)
1048     {
1049         uint32_t s = combine1 (ps, pm);
1050         uint32_t d = *pd;
1051
1052         *pd++ = pack_1x64_32 (
1053             pix_multiply_1x64 (
1054                 unpack_32_1x64 (d), negate_1x64 (
1055                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1056         ps++;
1057         if (pm)
1058             pm++;
1059         w--;
1060     }
1061 }
1062
1063 static force_inline void
1064 core_combine_out_u_sse2 (uint32_t*       pd,
1065                          const uint32_t* ps,
1066                          const uint32_t* pm,
1067                          int             w)
1068 {
1069     /* call prefetch hint to optimize cache load*/
1070     cache_prefetch ((__m128i*)ps);
1071     cache_prefetch ((__m128i*)pd);
1072     maybe_prefetch ((__m128i*)pm);
1073
1074     while (w && ((unsigned long) pd & 15))
1075     {
1076         uint32_t s = combine1 (ps, pm);
1077         uint32_t d = *pd;
1078
1079         *pd++ = pack_1x64_32 (
1080             pix_multiply_1x64 (
1081                 unpack_32_1x64 (s), negate_1x64 (
1082                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1083         w--;
1084         ps++;
1085         if (pm)
1086             pm++;
1087     }
1088
1089     /* call prefetch hint to optimize cache load*/
1090     cache_prefetch ((__m128i*)ps);
1091     cache_prefetch ((__m128i*)pd);
1092     maybe_prefetch ((__m128i*)pm);
1093
1094     while (w >= 4)
1095     {
1096         __m128i xmm_src_lo, xmm_src_hi;
1097         __m128i xmm_dst_lo, xmm_dst_hi;
1098
1099         /* fill cache line with next memory */
1100         cache_prefetch_next ((__m128i*)ps);
1101         cache_prefetch_next ((__m128i*)pd);
1102         maybe_prefetch_next ((__m128i*)pm);
1103
1104         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1105         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1106
1107         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1108         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1109
1110         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1111         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1112
1113         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1114                             &xmm_dst_lo, &xmm_dst_hi,
1115                             &xmm_dst_lo, &xmm_dst_hi);
1116
1117         save_128_aligned (
1118             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1119
1120         ps += 4;
1121         pd += 4;
1122         w -= 4;
1123         if (pm)
1124             pm += 4;
1125     }
1126
1127     while (w)
1128     {
1129         uint32_t s = combine1 (ps, pm);
1130         uint32_t d = *pd;
1131
1132         *pd++ = pack_1x64_32 (
1133             pix_multiply_1x64 (
1134                 unpack_32_1x64 (s), negate_1x64 (
1135                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1136         w--;
1137         ps++;
1138         if (pm)
1139             pm++;
1140     }
1141 }
1142
1143 static force_inline uint32_t
1144 core_combine_atop_u_pixel_sse2 (uint32_t src,
1145                                 uint32_t dst)
1146 {
1147     __m64 s = unpack_32_1x64 (src);
1148     __m64 d = unpack_32_1x64 (dst);
1149
1150     __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1151     __m64 da = expand_alpha_1x64 (d);
1152
1153     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1154 }
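/* ATOP per pixel: result = src * dst.alpha / 255
 *                        + dst * (255 - src.alpha) / 255,
 * with pix_add_multiply rounding both products before the saturating
 * add.
 */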
1155
1156 static force_inline void
1157 core_combine_atop_u_sse2 (uint32_t*       pd,
1158                           const uint32_t* ps,
1159                           const uint32_t* pm,
1160                           int             w)
1161 {
1162     uint32_t s, d;
1163
1164     __m128i xmm_src_lo, xmm_src_hi;
1165     __m128i xmm_dst_lo, xmm_dst_hi;
1166     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1167     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1168
1169     /* call prefetch hint to optimize cache load*/
1170     cache_prefetch ((__m128i*)ps);
1171     cache_prefetch ((__m128i*)pd);
1172     maybe_prefetch ((__m128i*)pm);
1173
1174     while (w && ((unsigned long) pd & 15))
1175     {
1176         s = combine1 (ps, pm);
1177         d = *pd;
1178
1179         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1180         w--;
1181         ps++;
1182         if (pm)
1183             pm++;
1184     }
1185
1186     /* call prefetch hint to optimize cache load*/
1187     cache_prefetch ((__m128i*)ps);
1188     cache_prefetch ((__m128i*)pd);
1189     maybe_prefetch ((__m128i*)pm);
1190
1191     while (w >= 4)
1192     {
1193         /* fill cache line with next memory */
1194         cache_prefetch_next ((__m128i*)ps);
1195         cache_prefetch_next ((__m128i*)pd);
1196         maybe_prefetch_next ((__m128i*)pm);
1197
1198         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1199         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1200
1201         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1202         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1203
1204         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1205                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1206         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1207                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1208
1209         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1210                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1211
1212         pix_add_multiply_2x128 (
1213             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1214             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1215             &xmm_dst_lo, &xmm_dst_hi);
1216
1217         save_128_aligned (
1218             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1219
1220         ps += 4;
1221         pd += 4;
1222         w -= 4;
1223         if (pm)
1224             pm += 4;
1225     }
1226
1227     while (w)
1228     {
1229         s = combine1 (ps, pm);
1230         d = *pd;
1231
1232         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1233         w--;
1234         ps++;
1235         if (pm)
1236             pm++;
1237     }
1238 }
1239
1240 static force_inline uint32_t
1241 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1242                                         uint32_t dst)
1243 {
1244     __m64 s = unpack_32_1x64 (src);
1245     __m64 d = unpack_32_1x64 (dst);
1246
1247     __m64 sa = expand_alpha_1x64 (s);
1248     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1249
1250     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1251 }
1252
1253 static force_inline void
1254 core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
1255                                   const uint32_t* ps,
1256                                   const uint32_t* pm,
1257                                   int             w)
1258 {
1259     uint32_t s, d;
1260
1261     __m128i xmm_src_lo, xmm_src_hi;
1262     __m128i xmm_dst_lo, xmm_dst_hi;
1263     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1264     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1265
1266     /* call prefetch hint to optimize cache load*/
1267     cache_prefetch ((__m128i*)ps);
1268     cache_prefetch ((__m128i*)pd);
1269     maybe_prefetch ((__m128i*)pm);
1270
1271     while (w && ((unsigned long) pd & 15))
1272     {
1273         s = combine1 (ps, pm);
1274         d = *pd;
1275
1276         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1277         ps++;
1278         w--;
1279         if (pm)
1280             pm++;
1281     }
1282
1283     /* call prefetch hint to optimize cache load*/
1284     cache_prefetch ((__m128i*)ps);
1285     cache_prefetch ((__m128i*)pd);
1286     maybe_prefetch ((__m128i*)pm);
1287
1288     while (w >= 4)
1289     {
1290         /* fill cache line with next memory */
1291         cache_prefetch_next ((__m128i*)ps);
1292         cache_prefetch_next ((__m128i*)pd);
1293         maybe_prefetch_next ((__m128i*)pm);
1294
1295         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1296         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1297
1298         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1299         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1300
1301         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1302                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1303         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1304                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1305
1306         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1307                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1308
1309         pix_add_multiply_2x128 (
1310             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1311             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1312             &xmm_dst_lo, &xmm_dst_hi);
1313
1314         save_128_aligned (
1315             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1316
1317         ps += 4;
1318         pd += 4;
1319         w -= 4;
1320         if (pm)
1321             pm += 4;
1322     }
1323
1324     while (w)
1325     {
1326         s = combine1 (ps, pm);
1327         d = *pd;
1328
1329         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1330         ps++;
1331         w--;
1332         if (pm)
1333             pm++;
1334     }
1335 }
1336
1337 static force_inline uint32_t
1338 core_combine_xor_u_pixel_sse2 (uint32_t src,
1339                                uint32_t dst)
1340 {
1341     __m64 s = unpack_32_1x64 (src);
1342     __m64 d = unpack_32_1x64 (dst);
1343
1344     __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1345     __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1346
1347     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1348 }
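/* XOR keeps each operand only where the other one is transparent:
 * result = src * (255 - dst.alpha) / 255 + dst * (255 - src.alpha) / 255.
 */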
1349
1350 static force_inline void
1351 core_combine_xor_u_sse2 (uint32_t*       dst,
1352                          const uint32_t* src,
1353                          const uint32_t *mask,
1354                          int             width)
1355 {
1356     int w = width;
1357     uint32_t s, d;
1358     uint32_t* pd = dst;
1359     const uint32_t* ps = src;
1360     const uint32_t* pm = mask;
1361
1362     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1363     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1364     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1365     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1366
1367     /* call prefetch hint to optimize cache load*/
1368     cache_prefetch ((__m128i*)ps);
1369     cache_prefetch ((__m128i*)pd);
1370     maybe_prefetch ((__m128i*)pm);
1371
1372     while (w && ((unsigned long) pd & 15))
1373     {
1374         s = combine1 (ps, pm);
1375         d = *pd;
1376
1377         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1378         w--;
1379         ps++;
1380         if (pm)
1381             pm++;
1382     }
1383
1384     /* call prefetch hint to optimize cache load*/
1385     cache_prefetch ((__m128i*)ps);
1386     cache_prefetch ((__m128i*)pd);
1387     maybe_prefetch ((__m128i*)pm);
1388
1389     while (w >= 4)
1390     {
1391         /* fill cache line with next memory */
1392         cache_prefetch_next ((__m128i*)ps);
1393         cache_prefetch_next ((__m128i*)pd);
1394         maybe_prefetch_next ((__m128i*)pm);
1395
1396         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1397         xmm_dst = load_128_aligned ((__m128i*) pd);
1398
1399         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1400         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1401
1402         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1403                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1404         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1405                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1406
1407         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1408                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1409         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1410                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1411
1412         pix_add_multiply_2x128 (
1413             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1414             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1415             &xmm_dst_lo, &xmm_dst_hi);
1416
1417         save_128_aligned (
1418             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1419
1420         ps += 4;
1421         pd += 4;
1422         w -= 4;
1423         if (pm)
1424             pm += 4;
1425     }
1426
1427     while (w)
1428     {
1429         s = combine1 (ps, pm);
1430         d = *pd;
1431
1432         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1433         w--;
1434         ps++;
1435         if (pm)
1436             pm++;
1437     }
1438 }
1439
1440 static force_inline void
1441 core_combine_add_u_sse2 (uint32_t*       dst,
1442                          const uint32_t* src,
1443                          const uint32_t* mask,
1444                          int             width)
1445 {
1446     int w = width;
1447     uint32_t s, d;
1448     uint32_t* pd = dst;
1449     const uint32_t* ps = src;
1450     const uint32_t* pm = mask;
1451
1452     /* call prefetch hint to optimize cache load*/
1453     cache_prefetch ((__m128i*)ps);
1454     cache_prefetch ((__m128i*)pd);
1455     maybe_prefetch ((__m128i*)pm);
1456
1457     while (w && (unsigned long)pd & 15)
1458     {
1459         s = combine1 (ps, pm);
1460         d = *pd;
1461
1462         ps++;
1463         if (pm)
1464             pm++;
1465         *pd++ = _mm_cvtsi64_si32 (
1466             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1467         w--;
1468     }
1469
1470     /* call prefetch hint to optimize cache load*/
1471     cache_prefetch ((__m128i*)ps);
1472     cache_prefetch ((__m128i*)pd);
1473     maybe_prefetch ((__m128i*)pm);
1474
1475     while (w >= 4)
1476     {
1477         __m128i s;
1478
1479         /* fill cache line with next memory */
1480         cache_prefetch_next ((__m128i*)ps);
1481         cache_prefetch_next ((__m128i*)pd);
1482         maybe_prefetch_next ((__m128i*)pm);
1483
1484         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1485
1486         save_128_aligned (
1487             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1488
1489         pd += 4;
1490         ps += 4;
1491         if (pm)
1492             pm += 4;
1493         w -= 4;
1494     }
1495
1496     while (w--)
1497     {
1498         s = combine1 (ps, pm);
1499         d = *pd;
1500
1501         ps++;
1502         *pd++ = _mm_cvtsi64_si32 (
1503             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1504         if (pm)
1505             pm++;
1506     }
1507 }
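/* ADD is a plain saturating per-channel sum; after the mask has been
 * applied by combine1/combine4, the head and tail pixels use the MMX
 * _mm_adds_pu8 and the main loop adds four pixels with a single
 * _mm_adds_epu8.
 */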
1508
1509 static force_inline uint32_t
1510 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1511                                     uint32_t dst)
1512 {
1513     __m64 ms = unpack_32_1x64 (src);
1514     __m64 md = unpack_32_1x64 (dst);
1515     uint32_t sa = src >> 24;
1516     uint32_t da = ~dst >> 24;
1517
1518     if (sa > da)
1519     {
1520         ms = pix_multiply_1x64 (
1521             ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1522     }
1523
1524     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1525 }
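/* SATURATE adds source and destination, but when the source alpha
 * exceeds the remaining headroom (255 - dst.alpha) the source is first
 * scaled by DIV_UN8 (da, sa), i.e. by the ratio
 * (255 - dst.alpha) / src.alpha, so the premultiplied channels cannot
 * overflow when added.
 */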
1526
1527 static force_inline void
1528 core_combine_saturate_u_sse2 (uint32_t *      pd,
1529                               const uint32_t *ps,
1530                               const uint32_t *pm,
1531                               int             w)
1532 {
1533     uint32_t s, d;
1534
1535     uint32_t pack_cmp;
1536     __m128i xmm_src, xmm_dst;
1537
1538     /* call prefetch hint to optimize cache load*/
1539     cache_prefetch ((__m128i*)ps);
1540     cache_prefetch ((__m128i*)pd);
1541     maybe_prefetch ((__m128i*)pm);
1542
1543     while (w && (unsigned long)pd & 15)
1544     {
1545         s = combine1 (ps, pm);
1546         d = *pd;
1547
1548         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1549         w--;
1550         ps++;
1551         if (pm)
1552             pm++;
1553     }
1554
1555     /* call prefetch hint to optimize cache load*/
1556     cache_prefetch ((__m128i*)ps);
1557     cache_prefetch ((__m128i*)pd);
1558     maybe_prefetch ((__m128i*)pm);
1559
1560     while (w >= 4)
1561     {
1562         /* fill cache line with next memory */
1563         cache_prefetch_next ((__m128i*)ps);
1564         cache_prefetch_next ((__m128i*)pd);
1565         maybe_prefetch_next ((__m128i*)pm);
1566
1567         xmm_dst = load_128_aligned  ((__m128i*)pd);
1568         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1569
1570         pack_cmp = _mm_movemask_epi8 (
1571             _mm_cmpgt_epi32 (
1572                 _mm_srli_epi32 (xmm_src, 24),
1573                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1574
1575         /* if any source alpha is greater than the corresponding ~alpha of dst */
1576         if (pack_cmp)
1577         {
1578             s = combine1 (ps++, pm);
1579             d = *pd;
1580             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1581             if (pm)
1582                 pm++;
1583
1584             s = combine1 (ps++, pm);
1585             d = *pd;
1586             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1587             if (pm)
1588                 pm++;
1589
1590             s = combine1 (ps++, pm);
1591             d = *pd;
1592             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1593             if (pm)
1594                 pm++;
1595
1596             s = combine1 (ps++, pm);
1597             d = *pd;
1598             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1599             if (pm)
1600                 pm++;
1601         }
1602         else
1603         {
1604             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1605
1606             pd += 4;
1607             ps += 4;
1608             if (pm)
1609                 pm += 4;
1610         }
1611
1612         w -= 4;
1613     }
1614
1615     while (w--)
1616     {
1617         s = combine1 (ps, pm);
1618         d = *pd;
1619
1620         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1621         ps++;
1622         if (pm)
1623             pm++;
1624     }
1625 }
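/* The four-pixel loop above first checks the whole group at once: the
 * four source alphas are compared against 255 - dst.alpha with
 * _mm_cmpgt_epi32 and the result collapsed by _mm_movemask_epi8.  If no
 * pixel needs rescaling, the group is finished with one saturating add;
 * otherwise it falls back to the per-pixel path.
 */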
1626
1627 static force_inline void
1628 core_combine_src_ca_sse2 (uint32_t*       pd,
1629                           const uint32_t* ps,
1630                           const uint32_t *pm,
1631                           int             w)
1632 {
1633     uint32_t s, m;
1634
1635     __m128i xmm_src_lo, xmm_src_hi;
1636     __m128i xmm_mask_lo, xmm_mask_hi;
1637     __m128i xmm_dst_lo, xmm_dst_hi;
1638
1639     /* call prefetch hint to optimize cache load*/
1640     cache_prefetch ((__m128i*)ps);
1641     cache_prefetch ((__m128i*)pd);
1642     cache_prefetch ((__m128i*)pm);
1643
1644     while (w && (unsigned long)pd & 15)
1645     {
1646         s = *ps++;
1647         m = *pm++;
1648         *pd++ = pack_1x64_32 (
1649             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1650         w--;
1651     }
1652
1653     /* call prefetch hint to optimize cache load*/
1654     cache_prefetch ((__m128i*)ps);
1655     cache_prefetch ((__m128i*)pd);
1656     cache_prefetch ((__m128i*)pm);
1657
1658     while (w >= 4)
1659     {
1660         /* fill cache line with next memory */
1661         cache_prefetch_next ((__m128i*)ps);
1662         cache_prefetch_next ((__m128i*)pd);
1663         cache_prefetch_next ((__m128i*)pm);
1664
1665         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1666         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1667
1668         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1669         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1670
1671         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1672                             &xmm_mask_lo, &xmm_mask_hi,
1673                             &xmm_dst_lo, &xmm_dst_hi);
1674
1675         save_128_aligned (
1676             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1677
1678         ps += 4;
1679         pd += 4;
1680         pm += 4;
1681         w -= 4;
1682     }
1683
1684     while (w)
1685     {
1686         s = *ps++;
1687         m = *pm++;
1688         *pd++ = pack_1x64_32 (
1689             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1690         w--;
1691     }
1692 }
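/* Component-alpha SRC: every channel of the source is multiplied by the
 * corresponding channel of the mask, result = src * mask / 255.
 */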
1693
1694 static force_inline uint32_t
1695 core_combine_over_ca_pixel_sse2 (uint32_t src,
1696                                  uint32_t mask,
1697                                  uint32_t dst)
1698 {
1699     __m64 s = unpack_32_1x64 (src);
1700     __m64 expAlpha = expand_alpha_1x64 (s);
1701     __m64 unpk_mask = unpack_32_1x64 (mask);
1702     __m64 unpk_dst  = unpack_32_1x64 (dst);
1703
1704     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1705 }
1706
1707 static force_inline void
1708 core_combine_over_ca_sse2 (uint32_t*       pd,
1709                            const uint32_t* ps,
1710                            const uint32_t *pm,
1711                            int             w)
1712 {
1713     uint32_t s, m, d;
1714
1715     __m128i xmm_alpha_lo, xmm_alpha_hi;
1716     __m128i xmm_src_lo, xmm_src_hi;
1717     __m128i xmm_dst_lo, xmm_dst_hi;
1718     __m128i xmm_mask_lo, xmm_mask_hi;
1719
1720     /* call prefetch hint to optimize cache load*/
1721     cache_prefetch ((__m128i*)ps);
1722     cache_prefetch ((__m128i*)pd);
1723     cache_prefetch ((__m128i*)pm);
1724
1725     while (w && (unsigned long)pd & 15)
1726     {
1727         s = *ps++;
1728         m = *pm++;
1729         d = *pd;
1730
1731         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1732         w--;
1733     }
1734
1735     /* call prefetch hint to optimize cache load*/
1736     cache_prefetch ((__m128i*)ps);
1737     cache_prefetch ((__m128i*)pd);
1738     cache_prefetch ((__m128i*)pm);
1739
1740     while (w >= 4)
1741     {
1742         /* fill cache line with next memory */
1743         cache_prefetch_next ((__m128i*)ps);
1744         cache_prefetch_next ((__m128i*)pd);
1745         cache_prefetch_next ((__m128i*)pm);
1746
1747         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1748         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1749         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1750
1751         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1752         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1753         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1754
1755         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1756                             &xmm_alpha_lo, &xmm_alpha_hi);
1757
1758         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1759                        &xmm_alpha_lo, &xmm_alpha_hi,
1760                        &xmm_mask_lo, &xmm_mask_hi,
1761                        &xmm_dst_lo, &xmm_dst_hi);
1762
1763         save_128_aligned (
1764             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1765
1766         ps += 4;
1767         pd += 4;
1768         pm += 4;
1769         w -= 4;
1770     }
1771
1772     while (w)
1773     {
1774         s = *ps++;
1775         m = *pm++;
1776         d = *pd;
1777
1778         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1779         w--;
1780     }
1781 }
1782
1783 static force_inline uint32_t
1784 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1785                                          uint32_t mask,
1786                                          uint32_t dst)
1787 {
1788     __m64 d = unpack_32_1x64 (dst);
1789
1790     return pack_1x64_32 (
1791         over_1x64 (d, expand_alpha_1x64 (d),
1792                    pix_multiply_1x64 (unpack_32_1x64 (src),
1793                                       unpack_32_1x64 (mask))));
1794 }
1795
1796 static force_inline void
1797 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1798                                    const uint32_t* ps,
1799                                    const uint32_t *pm,
1800                                    int             w)
1801 {
1802     uint32_t s, m, d;
1803
1804     __m128i xmm_alpha_lo, xmm_alpha_hi;
1805     __m128i xmm_src_lo, xmm_src_hi;
1806     __m128i xmm_dst_lo, xmm_dst_hi;
1807     __m128i xmm_mask_lo, xmm_mask_hi;
1808
1809     /* call prefetch hint to optimize cache load*/
1810     cache_prefetch ((__m128i*)ps);
1811     cache_prefetch ((__m128i*)pd);
1812     cache_prefetch ((__m128i*)pm);
1813
1814     while (w && (unsigned long)pd & 15)
1815     {
1816         s = *ps++;
1817         m = *pm++;
1818         d = *pd;
1819
1820         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1821         w--;
1822     }
1823
1824     /* call prefetch hint to optimize cache load*/
1825     cache_prefetch ((__m128i*)ps);
1826     cache_prefetch ((__m128i*)pd);
1827     cache_prefetch ((__m128i*)pm);
1828
1829     while (w >= 4)
1830     {
1831         /* fill cache line with next memory */
1832         cache_prefetch_next ((__m128i*)ps);
1833         cache_prefetch_next ((__m128i*)pd);
1834         cache_prefetch_next ((__m128i*)pm);
1835
1836         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1837         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1838         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1839
1840         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1841         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1842         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1843
1844         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1845                             &xmm_alpha_lo, &xmm_alpha_hi);
1846         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1847                             &xmm_mask_lo, &xmm_mask_hi,
1848                             &xmm_mask_lo, &xmm_mask_hi);
1849
1850         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1851                     &xmm_alpha_lo, &xmm_alpha_hi,
1852                     &xmm_mask_lo, &xmm_mask_hi);
1853
1854         save_128_aligned (
1855             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1856
1857         ps += 4;
1858         pd += 4;
1859         pm += 4;
1860         w -= 4;
1861     }
1862
1863     while (w)
1864     {
1865         s = *ps++;
1866         m = *pm++;
1867         d = *pd;
1868
1869         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1870         w--;
1871     }
1872 }
1873
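/* Component-alpha IN: per channel, result = (src * mask) * dst.alpha */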
1874 static force_inline void
1875 core_combine_in_ca_sse2 (uint32_t *      pd,
1876                          const uint32_t *ps,
1877                          const uint32_t *pm,
1878                          int             w)
1879 {
1880     uint32_t s, m, d;
1881
1882     __m128i xmm_alpha_lo, xmm_alpha_hi;
1883     __m128i xmm_src_lo, xmm_src_hi;
1884     __m128i xmm_dst_lo, xmm_dst_hi;
1885     __m128i xmm_mask_lo, xmm_mask_hi;
1886
1887     /* call prefetch hint to optimize cache load*/
1888     cache_prefetch ((__m128i*)ps);
1889     cache_prefetch ((__m128i*)pd);
1890     cache_prefetch ((__m128i*)pm);
1891
1892     while (w && (unsigned long)pd & 15)
1893     {
1894         s = *ps++;
1895         m = *pm++;
1896         d = *pd;
1897
1898         *pd++ = pack_1x64_32 (
1899             pix_multiply_1x64 (
1900                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1901                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1902
1903         w--;
1904     }
1905
1906     /* call prefetch hint to optimize cache load*/
1907     cache_prefetch ((__m128i*)ps);
1908     cache_prefetch ((__m128i*)pd);
1909     cache_prefetch ((__m128i*)pm);
1910
1911     while (w >= 4)
1912     {
1913         /* fill cache line with next memory */
1914         cache_prefetch_next ((__m128i*)ps);
1915         cache_prefetch_next ((__m128i*)pd);
1916         cache_prefetch_next ((__m128i*)pm);
1917
1918         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1919         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1920         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1921
1922         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1923         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1924         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1925
1926         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1927                             &xmm_alpha_lo, &xmm_alpha_hi);
1928
1929         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1930                             &xmm_mask_lo, &xmm_mask_hi,
1931                             &xmm_dst_lo, &xmm_dst_hi);
1932
1933         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1934                             &xmm_alpha_lo, &xmm_alpha_hi,
1935                             &xmm_dst_lo, &xmm_dst_hi);
1936
1937         save_128_aligned (
1938             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1939
1940         ps += 4;
1941         pd += 4;
1942         pm += 4;
1943         w -= 4;
1944     }
1945
1946     while (w)
1947     {
1948         s = *ps++;
1949         m = *pm++;
1950         d = *pd;
1951
1952         *pd++ = pack_1x64_32 (
1953             pix_multiply_1x64 (
1954                 pix_multiply_1x64 (
1955                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1956                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1957
1958         w--;
1959     }
1960 }
1961
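/* Component-alpha IN_REVERSE: per channel, result = dst * (mask * src.alpha) */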
1962 static force_inline void
1963 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1964                                  const uint32_t *ps,
1965                                  const uint32_t *pm,
1966                                  int             w)
1967 {
1968     uint32_t s, m, d;
1969
1970     __m128i xmm_alpha_lo, xmm_alpha_hi;
1971     __m128i xmm_src_lo, xmm_src_hi;
1972     __m128i xmm_dst_lo, xmm_dst_hi;
1973     __m128i xmm_mask_lo, xmm_mask_hi;
1974
1975     /* call prefetch hint to optimize cache load*/
1976     cache_prefetch ((__m128i*)ps);
1977     cache_prefetch ((__m128i*)pd);
1978     cache_prefetch ((__m128i*)pm);
1979
1980     while (w && (unsigned long)pd & 15)
1981     {
1982         s = *ps++;
1983         m = *pm++;
1984         d = *pd;
1985
1986         *pd++ = pack_1x64_32 (
1987             pix_multiply_1x64 (
1988                 unpack_32_1x64 (d),
1989                 pix_multiply_1x64 (unpack_32_1x64 (m),
1990                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1991         w--;
1992     }
1993
1994     /* call prefetch hint to optimize cache load*/
1995     cache_prefetch ((__m128i*)ps);
1996     cache_prefetch ((__m128i*)pd);
1997     cache_prefetch ((__m128i*)pm);
1998
1999     while (w >= 4)
2000     {
2001         /* fill cache line with next memory */
2002         cache_prefetch_next ((__m128i*)ps);
2003         cache_prefetch_next ((__m128i*)pd);
2004         cache_prefetch_next ((__m128i*)pm);
2005
2006         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2007         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2008         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2009
2010         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2011         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2012         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2013
2014         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2015                             &xmm_alpha_lo, &xmm_alpha_hi);
2016         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2017                             &xmm_alpha_lo, &xmm_alpha_hi,
2018                             &xmm_alpha_lo, &xmm_alpha_hi);
2019
2020         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2021                             &xmm_alpha_lo, &xmm_alpha_hi,
2022                             &xmm_dst_lo, &xmm_dst_hi);
2023
2024         save_128_aligned (
2025             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2026
2027         ps += 4;
2028         pd += 4;
2029         pm += 4;
2030         w -= 4;
2031     }
2032
2033     while (w)
2034     {
2035         s = *ps++;
2036         m = *pm++;
2037         d = *pd;
2038
2039         *pd++ = pack_1x64_32 (
2040             pix_multiply_1x64 (
2041                 unpack_32_1x64 (d),
2042                 pix_multiply_1x64 (unpack_32_1x64 (m),
2043                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
2044         w--;
2045     }
2046 }
2047
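/* Component-alpha OUT: per channel, result = (src * mask) * (1 - dst.alpha) */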
2048 static force_inline void
2049 core_combine_out_ca_sse2 (uint32_t *      pd,
2050                           const uint32_t *ps,
2051                           const uint32_t *pm,
2052                           int             w)
2053 {
2054     uint32_t s, m, d;
2055
2056     __m128i xmm_alpha_lo, xmm_alpha_hi;
2057     __m128i xmm_src_lo, xmm_src_hi;
2058     __m128i xmm_dst_lo, xmm_dst_hi;
2059     __m128i xmm_mask_lo, xmm_mask_hi;
2060
2061     /* call prefetch hint to optimize cache load*/
2062     cache_prefetch ((__m128i*)ps);
2063     cache_prefetch ((__m128i*)pd);
2064     cache_prefetch ((__m128i*)pm);
2065
2066     while (w && (unsigned long)pd & 15)
2067     {
2068         s = *ps++;
2069         m = *pm++;
2070         d = *pd;
2071
2072         *pd++ = pack_1x64_32 (
2073             pix_multiply_1x64 (
2074                 pix_multiply_1x64 (
2075                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2076                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2077         w--;
2078     }
2079
2080     /* call prefetch hint to optimize cache load*/
2081     cache_prefetch ((__m128i*)ps);
2082     cache_prefetch ((__m128i*)pd);
2083     cache_prefetch ((__m128i*)pm);
2084
2085     while (w >= 4)
2086     {
2087         /* fill cache line with next memory */
2088         cache_prefetch_next ((__m128i*)ps);
2089         cache_prefetch_next ((__m128i*)pd);
2090         cache_prefetch_next ((__m128i*)pm);
2091
2092         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2093         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2094         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2095
2096         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2097         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2098         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2099
2100         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2101                             &xmm_alpha_lo, &xmm_alpha_hi);
2102         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2103                       &xmm_alpha_lo, &xmm_alpha_hi);
2104
2105         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2106                             &xmm_mask_lo, &xmm_mask_hi,
2107                             &xmm_dst_lo, &xmm_dst_hi);
2108         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2109                             &xmm_alpha_lo, &xmm_alpha_hi,
2110                             &xmm_dst_lo, &xmm_dst_hi);
2111
2112         save_128_aligned (
2113             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2114
2115         ps += 4;
2116         pd += 4;
2117         pm += 4;
2118         w -= 4;
2119     }
2120
2121     while (w)
2122     {
2123         s = *ps++;
2124         m = *pm++;
2125         d = *pd;
2126
2127         *pd++ = pack_1x64_32 (
2128             pix_multiply_1x64 (
2129                 pix_multiply_1x64 (
2130                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2131                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2132
2133         w--;
2134     }
2135 }
2136
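/* Component-alpha OUT_REVERSE: per channel, result = dst * (1 - mask * src.alpha) */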
2137 static force_inline void
2138 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
2139                                   const uint32_t *ps,
2140                                   const uint32_t *pm,
2141                                   int             w)
2142 {
2143     uint32_t s, m, d;
2144
2145     __m128i xmm_alpha_lo, xmm_alpha_hi;
2146     __m128i xmm_src_lo, xmm_src_hi;
2147     __m128i xmm_dst_lo, xmm_dst_hi;
2148     __m128i xmm_mask_lo, xmm_mask_hi;
2149
2150     /* call prefetch hint to optimize cache load*/
2151     cache_prefetch ((__m128i*)ps);
2152     cache_prefetch ((__m128i*)pd);
2153     cache_prefetch ((__m128i*)pm);
2154
2155     while (w && (unsigned long)pd & 15)
2156     {
2157         s = *ps++;
2158         m = *pm++;
2159         d = *pd;
2160
2161         *pd++ = pack_1x64_32 (
2162             pix_multiply_1x64 (
2163                 unpack_32_1x64 (d),
2164                 negate_1x64 (pix_multiply_1x64 (
2165                                  unpack_32_1x64 (m),
2166                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2167         w--;
2168     }
2169
2170     /* call prefetch hint to optimize cache load*/
2171     cache_prefetch ((__m128i*)ps);
2172     cache_prefetch ((__m128i*)pd);
2173     cache_prefetch ((__m128i*)pm);
2174
2175     while (w >= 4)
2176     {
2177         /* fill cache line with next memory */
2178         cache_prefetch_next ((__m128i*)ps);
2179         cache_prefetch_next ((__m128i*)pd);
2180         cache_prefetch_next ((__m128i*)pm);
2181
2182         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2183         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2184         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2185
2186         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2187         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2188         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2189
2190         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2191                             &xmm_alpha_lo, &xmm_alpha_hi);
2192
2193         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2194                             &xmm_alpha_lo, &xmm_alpha_hi,
2195                             &xmm_mask_lo, &xmm_mask_hi);
2196
2197         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2198                       &xmm_mask_lo, &xmm_mask_hi);
2199
2200         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2201                             &xmm_mask_lo, &xmm_mask_hi,
2202                             &xmm_dst_lo, &xmm_dst_hi);
2203
2204         save_128_aligned (
2205             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2206
2207         ps += 4;
2208         pd += 4;
2209         pm += 4;
2210         w -= 4;
2211     }
2212
2213     while (w)
2214     {
2215         s = *ps++;
2216         m = *pm++;
2217         d = *pd;
2218
2219         *pd++ = pack_1x64_32 (
2220             pix_multiply_1x64 (
2221                 unpack_32_1x64 (d),
2222                 negate_1x64 (pix_multiply_1x64 (
2223                                  unpack_32_1x64 (m),
2224                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2225         w--;
2226     }
2227 }
2228
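/* Component-alpha ATOP: per channel,
 *   result = (src * mask) * dst.alpha + dst * (1 - mask * src.alpha)
 */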
2229 static force_inline uint32_t
2230 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2231                                  uint32_t mask,
2232                                  uint32_t dst)
2233 {
2234     __m64 m = unpack_32_1x64 (mask);
2235     __m64 s = unpack_32_1x64 (src);
2236     __m64 d = unpack_32_1x64 (dst);
2237     __m64 sa = expand_alpha_1x64 (s);
2238     __m64 da = expand_alpha_1x64 (d);
2239
2240     s = pix_multiply_1x64 (s, m);
2241     m = negate_1x64 (pix_multiply_1x64 (m, sa));
2242
2243     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2244 }
2245
2246 static force_inline void
2247 core_combine_atop_ca_sse2 (uint32_t *      pd,
2248                            const uint32_t *ps,
2249                            const uint32_t *pm,
2250                            int             w)
2251 {
2252     uint32_t s, m, d;
2253
2254     __m128i xmm_src_lo, xmm_src_hi;
2255     __m128i xmm_dst_lo, xmm_dst_hi;
2256     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2257     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2258     __m128i xmm_mask_lo, xmm_mask_hi;
2259
2260     /* call prefetch hint to optimize cache load*/
2261     cache_prefetch ((__m128i*)ps);
2262     cache_prefetch ((__m128i*)pd);
2263     cache_prefetch ((__m128i*)pm);
2264
2265     while (w && (unsigned long)pd & 15)
2266     {
2267         s = *ps++;
2268         m = *pm++;
2269         d = *pd;
2270
2271         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2272         w--;
2273     }
2274
2275     /* call prefetch hint to optimize cache load*/
2276     cache_prefetch ((__m128i*)ps);
2277     cache_prefetch ((__m128i*)pd);
2278     cache_prefetch ((__m128i*)pm);
2279
2280     while (w >= 4)
2281     {
2282         /* fill cache line with next memory */
2283         cache_prefetch_next ((__m128i*)ps);
2284         cache_prefetch_next ((__m128i*)pd);
2285         cache_prefetch_next ((__m128i*)pm);
2286
2287         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2288         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2289         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2290
2291         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2292         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2293         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2294
2295         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2296                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2297         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2298                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2299
2300         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2301                             &xmm_mask_lo, &xmm_mask_hi,
2302                             &xmm_src_lo, &xmm_src_hi);
2303         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2304                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2305                             &xmm_mask_lo, &xmm_mask_hi);
2306
2307         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2308
2309         pix_add_multiply_2x128 (
2310             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2311             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2312             &xmm_dst_lo, &xmm_dst_hi);
2313
2314         save_128_aligned (
2315             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2316
2317         ps += 4;
2318         pd += 4;
2319         pm += 4;
2320         w -= 4;
2321     }
2322
2323     while (w)
2324     {
2325         s = *ps++;
2326         m = *pm++;
2327         d = *pd;
2328
2329         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2330         w--;
2331     }
2332 }
2333
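/* Component-alpha ATOP_REVERSE: per channel,
 *   result = (src * mask) * (1 - dst.alpha) + dst * (mask * src.alpha)
 */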
2334 static force_inline uint32_t
2335 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2336                                          uint32_t mask,
2337                                          uint32_t dst)
2338 {
2339     __m64 m = unpack_32_1x64 (mask);
2340     __m64 s = unpack_32_1x64 (src);
2341     __m64 d = unpack_32_1x64 (dst);
2342
2343     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2344     __m64 sa = expand_alpha_1x64 (s);
2345
2346     s = pix_multiply_1x64 (s, m);
2347     m = pix_multiply_1x64 (m, sa);
2348
2349     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2350 }
2351
2352 static force_inline void
2353 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2354                                    const uint32_t *ps,
2355                                    const uint32_t *pm,
2356                                    int             w)
2357 {
2358     uint32_t s, m, d;
2359
2360     __m128i xmm_src_lo, xmm_src_hi;
2361     __m128i xmm_dst_lo, xmm_dst_hi;
2362     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2363     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2364     __m128i xmm_mask_lo, xmm_mask_hi;
2365
2366     /* call prefetch hint to optimize cache load*/
2367     cache_prefetch ((__m128i*)ps);
2368     cache_prefetch ((__m128i*)pd);
2369     cache_prefetch ((__m128i*)pm);
2370
2371     while (w && (unsigned long)pd & 15)
2372     {
2373         s = *ps++;
2374         m = *pm++;
2375         d = *pd;
2376
2377         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2378         w--;
2379     }
2380
2381     /* call prefetch hint to optimize cache load*/
2382     cache_prefetch ((__m128i*)ps);
2383     cache_prefetch ((__m128i*)pd);
2384     cache_prefetch ((__m128i*)pm);
2385
2386     while (w >= 4)
2387     {
2388         /* fill cache line with next memory */
2389         cache_prefetch_next ((__m128i*)ps);
2390         cache_prefetch_next ((__m128i*)pd);
2391         cache_prefetch_next ((__m128i*)pm);
2392
2393         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2394         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2395         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2396
2397         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2398         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2399         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2400
2401         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2402                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2403         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2404                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2405
2406         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2407                             &xmm_mask_lo, &xmm_mask_hi,
2408                             &xmm_src_lo, &xmm_src_hi);
2409         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2410                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2411                             &xmm_mask_lo, &xmm_mask_hi);
2412
2413         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2414                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2415
2416         pix_add_multiply_2x128 (
2417             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2418             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2419             &xmm_dst_lo, &xmm_dst_hi);
2420
2421         save_128_aligned (
2422             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2423
2424         ps += 4;
2425         pd += 4;
2426         pm += 4;
2427         w -= 4;
2428     }
2429
2430     while (w)
2431     {
2432         s = *ps++;
2433         m = *pm++;
2434         d = *pd;
2435
2436         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2437         w--;
2438     }
2439 }
2440
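/* Component-alpha XOR: per channel,
 *   result = (src * mask) * (1 - dst.alpha) + dst * (1 - mask * src.alpha)
 */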
2441 static force_inline uint32_t
2442 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2443                                 uint32_t mask,
2444                                 uint32_t dst)
2445 {
2446     __m64 a = unpack_32_1x64 (mask);
2447     __m64 s = unpack_32_1x64 (src);
2448     __m64 d = unpack_32_1x64 (dst);
2449
2450     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2451                                        a, expand_alpha_1x64 (s)));
2452     __m64 dest      = pix_multiply_1x64 (s, a);
2453     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2454
2455     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2456                                                 &alpha_dst,
2457                                                 &dest,
2458                                                 &alpha_src));
2459 }
2460
2461 static force_inline void
2462 core_combine_xor_ca_sse2 (uint32_t *      pd,
2463                           const uint32_t *ps,
2464                           const uint32_t *pm,
2465                           int             w)
2466 {
2467     uint32_t s, m, d;
2468
2469     __m128i xmm_src_lo, xmm_src_hi;
2470     __m128i xmm_dst_lo, xmm_dst_hi;
2471     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2472     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2473     __m128i xmm_mask_lo, xmm_mask_hi;
2474
2475     /* call prefetch hint to optimize cache load*/
2476     cache_prefetch ((__m128i*)ps);
2477     cache_prefetch ((__m128i*)pd);
2478     cache_prefetch ((__m128i*)pm);
2479
2480     while (w && (unsigned long)pd & 15)
2481     {
2482         s = *ps++;
2483         m = *pm++;
2484         d = *pd;
2485
2486         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2487         w--;
2488     }
2489
2490     /* call prefetch hint to optimize cache load*/
2491     cache_prefetch ((__m128i*)ps);
2492     cache_prefetch ((__m128i*)pd);
2493     cache_prefetch ((__m128i*)pm);
2494
2495     while (w >= 4)
2496     {
2497         /* fill cache line with next memory */
2498         cache_prefetch_next ((__m128i*)ps);
2499         cache_prefetch_next ((__m128i*)pd);
2500         cache_prefetch_next ((__m128i*)pm);
2501
2502         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2503         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2504         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2505
2506         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2507         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2508         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2509
2510         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2511                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2512         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2513                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2514
2515         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2516                             &xmm_mask_lo, &xmm_mask_hi,
2517                             &xmm_src_lo, &xmm_src_hi);
2518         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2519                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2520                             &xmm_mask_lo, &xmm_mask_hi);
2521
2522         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2523                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2524         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2525                       &xmm_mask_lo, &xmm_mask_hi);
2526
2527         pix_add_multiply_2x128 (
2528             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2529             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2530             &xmm_dst_lo, &xmm_dst_hi);
2531
2532         save_128_aligned (
2533             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2534
2535         ps += 4;
2536         pd += 4;
2537         pm += 4;
2538         w -= 4;
2539     }
2540
2541     while (w)
2542     {
2543         s = *ps++;
2544         m = *pm++;
2545         d = *pd;
2546
2547         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2548         w--;
2549     }
2550 }
2551
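/* Component-alpha ADD: per channel, result = clamp (src * mask + dst) */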
2552 static force_inline void
2553 core_combine_add_ca_sse2 (uint32_t *      pd,
2554                           const uint32_t *ps,
2555                           const uint32_t *pm,
2556                           int             w)
2557 {
2558     uint32_t s, m, d;
2559
2560     __m128i xmm_src_lo, xmm_src_hi;
2561     __m128i xmm_dst_lo, xmm_dst_hi;
2562     __m128i xmm_mask_lo, xmm_mask_hi;
2563
2564     /* call prefetch hint to optimize cache load*/
2565     cache_prefetch ((__m128i*)ps);
2566     cache_prefetch ((__m128i*)pd);
2567     cache_prefetch ((__m128i*)pm);
2568
2569     while (w && (unsigned long)pd & 15)
2570     {
2571         s = *ps++;
2572         m = *pm++;
2573         d = *pd;
2574
2575         *pd++ = pack_1x64_32 (
2576             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2577                                              unpack_32_1x64 (m)),
2578                           unpack_32_1x64 (d)));
2579         w--;
2580     }
2581
2582     /* call prefetch hint to optimize cache load*/
2583     cache_prefetch ((__m128i*)ps);
2584     cache_prefetch ((__m128i*)pd);
2585     cache_prefetch ((__m128i*)pm);
2586
2587     while (w >= 4)
2588     {
2589         /* fill cache line with next memory */
2590         cache_prefetch_next ((__m128i*)ps);
2591         cache_prefetch_next ((__m128i*)pd);
2592         cache_prefetch_next ((__m128i*)pm);
2593
2594         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2595         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2596         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2597
2598         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2599         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2600         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2601
2602         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2603                             &xmm_mask_lo, &xmm_mask_hi,
2604                             &xmm_src_lo, &xmm_src_hi);
2605
2606         save_128_aligned (
2607             (__m128i*)pd, pack_2x128_128 (
2608                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2609                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2610
2611         ps += 4;
2612         pd += 4;
2613         pm += 4;
2614         w -= 4;
2615     }
2616
2617     while (w)
2618     {
2619         s = *ps++;
2620         m = *pm++;
2621         d = *pd;
2622
2623         *pd++ = pack_1x64_32 (
2624             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2625                                              unpack_32_1x64 (m)),
2626                           unpack_32_1x64 (d)));
2627         w--;
2628     }
2629 }
2630
2631 /* ---------------------------------------------------
2632  * fb_compose_setup_SSE2
2633  */
2634 static force_inline __m64
2635 create_mask_16_64 (uint16_t mask)
2636 {
2637     return _mm_set1_pi16 (mask);
2638 }
2639
2640 static force_inline __m128i
2641 create_mask_16_128 (uint16_t mask)
2642 {
2643     return _mm_set1_epi16 (mask);
2644 }
2645
2646 static force_inline __m64
2647 create_mask_2x32_64 (uint32_t mask0,
2648                      uint32_t mask1)
2649 {
2650     return _mm_set_pi32 (mask0, mask1);
2651 }
2652
2653 /* Work around a code generation bug in Sun Studio 12. */
2654 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2655 # define create_mask_2x32_128(mask0, mask1)                             \
2656     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2657 #else
2658 static force_inline __m128i
2659 create_mask_2x32_128 (uint32_t mask0,
2660                       uint32_t mask1)
2661 {
2662     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2663 }
2664 #endif
2665
2666 /* SSE2 code patch for fbcompose.c */
2667
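/* The wrappers below adapt the core_combine_*_sse2() helpers to the pixman
 * combiner signature.  Each one ends with _mm_empty() because the per-pixel
 * head/tail paths above use MMX (__m64) registers.
 */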
2668 static void
2669 sse2_combine_over_u (pixman_implementation_t *imp,
2670                      pixman_op_t              op,
2671                      uint32_t *               dst,
2672                      const uint32_t *         src,
2673                      const uint32_t *         mask,
2674                      int                      width)
2675 {
2676     core_combine_over_u_sse2 (dst, src, mask, width);
2677     _mm_empty ();
2678 }
2679
2680 static void
2681 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2682                              pixman_op_t              op,
2683                              uint32_t *               dst,
2684                              const uint32_t *         src,
2685                              const uint32_t *         mask,
2686                              int                      width)
2687 {
2688     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2689     _mm_empty ();
2690 }
2691
2692 static void
2693 sse2_combine_in_u (pixman_implementation_t *imp,
2694                    pixman_op_t              op,
2695                    uint32_t *               dst,
2696                    const uint32_t *         src,
2697                    const uint32_t *         mask,
2698                    int                      width)
2699 {
2700     core_combine_in_u_sse2 (dst, src, mask, width);
2701     _mm_empty ();
2702 }
2703
2704 static void
2705 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2706                            pixman_op_t              op,
2707                            uint32_t *               dst,
2708                            const uint32_t *         src,
2709                            const uint32_t *         mask,
2710                            int                      width)
2711 {
2712     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2713     _mm_empty ();
2714 }
2715
2716 static void
2717 sse2_combine_out_u (pixman_implementation_t *imp,
2718                     pixman_op_t              op,
2719                     uint32_t *               dst,
2720                     const uint32_t *         src,
2721                     const uint32_t *         mask,
2722                     int                      width)
2723 {
2724     core_combine_out_u_sse2 (dst, src, mask, width);
2725     _mm_empty ();
2726 }
2727
2728 static void
2729 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2730                             pixman_op_t              op,
2731                             uint32_t *               dst,
2732                             const uint32_t *         src,
2733                             const uint32_t *         mask,
2734                             int                      width)
2735 {
2736     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2737     _mm_empty ();
2738 }
2739
2740 static void
2741 sse2_combine_atop_u (pixman_implementation_t *imp,
2742                      pixman_op_t              op,
2743                      uint32_t *               dst,
2744                      const uint32_t *         src,
2745                      const uint32_t *         mask,
2746                      int                      width)
2747 {
2748     core_combine_atop_u_sse2 (dst, src, mask, width);
2749     _mm_empty ();
2750 }
2751
2752 static void
2753 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2754                              pixman_op_t              op,
2755                              uint32_t *               dst,
2756                              const uint32_t *         src,
2757                              const uint32_t *         mask,
2758                              int                      width)
2759 {
2760     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2761     _mm_empty ();
2762 }
2763
2764 static void
2765 sse2_combine_xor_u (pixman_implementation_t *imp,
2766                     pixman_op_t              op,
2767                     uint32_t *               dst,
2768                     const uint32_t *         src,
2769                     const uint32_t *         mask,
2770                     int                      width)
2771 {
2772     core_combine_xor_u_sse2 (dst, src, mask, width);
2773     _mm_empty ();
2774 }
2775
2776 static void
2777 sse2_combine_add_u (pixman_implementation_t *imp,
2778                     pixman_op_t              op,
2779                     uint32_t *               dst,
2780                     const uint32_t *         src,
2781                     const uint32_t *         mask,
2782                     int                      width)
2783 {
2784     core_combine_add_u_sse2 (dst, src, mask, width);
2785     _mm_empty ();
2786 }
2787
2788 static void
2789 sse2_combine_saturate_u (pixman_implementation_t *imp,
2790                          pixman_op_t              op,
2791                          uint32_t *               dst,
2792                          const uint32_t *         src,
2793                          const uint32_t *         mask,
2794                          int                      width)
2795 {
2796     core_combine_saturate_u_sse2 (dst, src, mask, width);
2797     _mm_empty ();
2798 }
2799
2800 static void
2801 sse2_combine_src_ca (pixman_implementation_t *imp,
2802                      pixman_op_t              op,
2803                      uint32_t *               dst,
2804                      const uint32_t *         src,
2805                      const uint32_t *         mask,
2806                      int                      width)
2807 {
2808     core_combine_src_ca_sse2 (dst, src, mask, width);
2809     _mm_empty ();
2810 }
2811
2812 static void
2813 sse2_combine_over_ca (pixman_implementation_t *imp,
2814                       pixman_op_t              op,
2815                       uint32_t *               dst,
2816                       const uint32_t *         src,
2817                       const uint32_t *         mask,
2818                       int                      width)
2819 {
2820     core_combine_over_ca_sse2 (dst, src, mask, width);
2821     _mm_empty ();
2822 }
2823
2824 static void
2825 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2826                               pixman_op_t              op,
2827                               uint32_t *               dst,
2828                               const uint32_t *         src,
2829                               const uint32_t *         mask,
2830                               int                      width)
2831 {
2832     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2833     _mm_empty ();
2834 }
2835
2836 static void
2837 sse2_combine_in_ca (pixman_implementation_t *imp,
2838                     pixman_op_t              op,
2839                     uint32_t *               dst,
2840                     const uint32_t *         src,
2841                     const uint32_t *         mask,
2842                     int                      width)
2843 {
2844     core_combine_in_ca_sse2 (dst, src, mask, width);
2845     _mm_empty ();
2846 }
2847
2848 static void
2849 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2850                             pixman_op_t              op,
2851                             uint32_t *               dst,
2852                             const uint32_t *         src,
2853                             const uint32_t *         mask,
2854                             int                      width)
2855 {
2856     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2857     _mm_empty ();
2858 }
2859
2860 static void
2861 sse2_combine_out_ca (pixman_implementation_t *imp,
2862                      pixman_op_t              op,
2863                      uint32_t *               dst,
2864                      const uint32_t *         src,
2865                      const uint32_t *         mask,
2866                      int                      width)
2867 {
2868     core_combine_out_ca_sse2 (dst, src, mask, width);
2869     _mm_empty ();
2870 }
2871
2872 static void
2873 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2874                              pixman_op_t              op,
2875                              uint32_t *               dst,
2876                              const uint32_t *         src,
2877                              const uint32_t *         mask,
2878                              int                      width)
2879 {
2880     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2881     _mm_empty ();
2882 }
2883
2884 static void
2885 sse2_combine_atop_ca (pixman_implementation_t *imp,
2886                       pixman_op_t              op,
2887                       uint32_t *               dst,
2888                       const uint32_t *         src,
2889                       const uint32_t *         mask,
2890                       int                      width)
2891 {
2892     core_combine_atop_ca_sse2 (dst, src, mask, width);
2893     _mm_empty ();
2894 }
2895
2896 static void
2897 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2898                               pixman_op_t              op,
2899                               uint32_t *               dst,
2900                               const uint32_t *         src,
2901                               const uint32_t *         mask,
2902                               int                      width)
2903 {
2904     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2905     _mm_empty ();
2906 }
2907
2908 static void
2909 sse2_combine_xor_ca (pixman_implementation_t *imp,
2910                      pixman_op_t              op,
2911                      uint32_t *               dst,
2912                      const uint32_t *         src,
2913                      const uint32_t *         mask,
2914                      int                      width)
2915 {
2916     core_combine_xor_ca_sse2 (dst, src, mask, width);
2917     _mm_empty ();
2918 }
2919
2920 static void
2921 sse2_combine_add_ca (pixman_implementation_t *imp,
2922                      pixman_op_t              op,
2923                      uint32_t *               dst,
2924                      const uint32_t *         src,
2925                      const uint32_t *         mask,
2926                      int                      width)
2927 {
2928     core_combine_add_ca_sse2 (dst, src, mask, width);
2929     _mm_empty ();
2930 }
2931
2932 /* -------------------------------------------------------------------
2933  * composite_over_n_8888
2934  */
2935
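/* Fast path: solid source OVER a 32 bpp destination.  Pixels are handled
 * one at a time until dst reaches 16-byte alignment, then four at a time
 * with SSE2, then one at a time for the remainder.
 */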
2936 static void
2937 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2938                             pixman_op_t              op,
2939                             pixman_image_t *         src_image,
2940                             pixman_image_t *         mask_image,
2941                             pixman_image_t *         dst_image,
2942                             int32_t                  src_x,
2943                             int32_t                  src_y,
2944                             int32_t                  mask_x,
2945                             int32_t                  mask_y,
2946                             int32_t                  dest_x,
2947                             int32_t                  dest_y,
2948                             int32_t                  width,
2949                             int32_t                  height)
2950 {
2951     uint32_t src;
2952     uint32_t    *dst_line, *dst, d;
2953     int32_t w;
2954     int dst_stride;
2955     __m128i xmm_src, xmm_alpha;
2956     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2957
2958     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2959
2960     if (src == 0)
2961         return;
2962
2963     PIXMAN_IMAGE_GET_LINE (
2964         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2965
2966     xmm_src = expand_pixel_32_1x128 (src);
2967     xmm_alpha = expand_alpha_1x128 (xmm_src);
2968
2969     while (height--)
2970     {
2971         dst = dst_line;
2972
2973         /* call prefetch hint to optimize cache load*/
2974         cache_prefetch ((__m128i*)dst);
2975
2976         dst_line += dst_stride;
2977         w = width;
2978
2979         while (w && (unsigned long)dst & 15)
2980         {
2981             d = *dst;
2982             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2983                                               _mm_movepi64_pi64 (xmm_alpha),
2984                                               unpack_32_1x64 (d)));
2985             w--;
2986         }
2987
2988         cache_prefetch ((__m128i*)dst);
2989
2990         while (w >= 4)
2991         {
2992             /* fill cache line with next memory */
2993             cache_prefetch_next ((__m128i*)dst);
2994
2995             xmm_dst = load_128_aligned ((__m128i*)dst);
2996
2997             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2998
2999             over_2x128 (&xmm_src, &xmm_src,
3000                         &xmm_alpha, &xmm_alpha,
3001                         &xmm_dst_lo, &xmm_dst_hi);
3002
3003             /* rebuild the 4 pixel data and save */
3004             save_128_aligned (
3005                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3006
3007             w -= 4;
3008             dst += 4;
3009         }
3010
3011         while (w)
3012         {
3013             d = *dst;
3014             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3015                                               _mm_movepi64_pi64 (xmm_alpha),
3016                                               unpack_32_1x64 (d)));
3017             w--;
3018         }
3019
3020     }
3021     _mm_empty ();
3022 }
3023
3024 /* ---------------------------------------------------------------------
3025  * composite_over_n_0565
3026  */
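/* Fast path: solid source OVER an r5g6b5 destination.  The SSE2 body
 * expands eight 16-bit pixels to 8888, composites, and packs back to 565.
 */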
3027 static void
3028 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3029                             pixman_op_t              op,
3030                             pixman_image_t *         src_image,
3031                             pixman_image_t *         mask_image,
3032                             pixman_image_t *         dst_image,
3033                             int32_t                  src_x,
3034                             int32_t                  src_y,
3035                             int32_t                  mask_x,
3036                             int32_t                  mask_y,
3037                             int32_t                  dest_x,
3038                             int32_t                  dest_y,
3039                             int32_t                  width,
3040                             int32_t                  height)
3041 {
3042     uint32_t src;
3043     uint16_t    *dst_line, *dst, d;
3044     int32_t w;
3045     int dst_stride;
3046     __m128i xmm_src, xmm_alpha;
3047     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3048
3049     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3050
3051     if (src == 0)
3052         return;
3053
3054     PIXMAN_IMAGE_GET_LINE (
3055         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3056
3057     xmm_src = expand_pixel_32_1x128 (src);
3058     xmm_alpha = expand_alpha_1x128 (xmm_src);
3059
3060     while (height--)
3061     {
3062         dst = dst_line;
3063
3064         /* call prefetch hint to optimize cache load*/
3065         cache_prefetch ((__m128i*)dst);
3066
3067         dst_line += dst_stride;
3068         w = width;
3069
3070         while (w && (unsigned long)dst & 15)
3071         {
3072             d = *dst;
3073
3074             *dst++ = pack_565_32_16 (
3075                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3076                                          _mm_movepi64_pi64 (xmm_alpha),
3077                                          expand565_16_1x64 (d))));
3078             w--;
3079         }
3080
3081         /* call prefetch hint to optimize cache load*/
3082         cache_prefetch ((__m128i*)dst);
3083
3084         while (w >= 8)
3085         {
3086             /* fill cache line with next memory */
3087             cache_prefetch_next ((__m128i*)dst);
3088
3089             xmm_dst = load_128_aligned ((__m128i*)dst);
3090
3091             unpack_565_128_4x128 (xmm_dst,
3092                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3093
3094             over_2x128 (&xmm_src, &xmm_src,
3095                         &xmm_alpha, &xmm_alpha,
3096                         &xmm_dst0, &xmm_dst1);
3097             over_2x128 (&xmm_src, &xmm_src,
3098                         &xmm_alpha, &xmm_alpha,
3099                         &xmm_dst2, &xmm_dst3);
3100
3101             xmm_dst = pack_565_4x128_128 (
3102                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3103
3104             save_128_aligned ((__m128i*)dst, xmm_dst);
3105
3106             dst += 8;
3107             w -= 8;
3108         }
3109
3110         while (w--)
3111         {
3112             d = *dst;
3113             *dst++ = pack_565_32_16 (
3114                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3115                                          _mm_movepi64_pi64 (xmm_alpha),
3116                                          expand565_16_1x64 (d))));
3117         }
3118     }
3119
3120     _mm_empty ();
3121 }
3122
3123 /* ------------------------------
3124  * composite_add_n_8888_8888_ca
3125  */
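/* Fast path: solid source ADD destination with a per-component a8r8g8b8
 * mask.  Four-pixel blocks whose mask is entirely zero are skipped (the
 * movemask of the compare-with-zero yields 0xffff in that case).
 */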
3126 static void
3127 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3128                                    pixman_op_t              op,
3129                                    pixman_image_t *         src_image,
3130                                    pixman_image_t *         mask_image,
3131                                    pixman_image_t *         dst_image,
3132                                    int32_t                  src_x,
3133                                    int32_t                  src_y,
3134                                    int32_t                  mask_x,
3135                                    int32_t                  mask_y,
3136                                    int32_t                  dest_x,
3137                                    int32_t                  dest_y,
3138                                    int32_t                  width,
3139                                    int32_t                  height)
3140 {
3141     uint32_t src, srca;
3142     uint32_t    *dst_line, d;
3143     uint32_t    *mask_line, m;
3144     uint32_t pack_cmp;
3145     int dst_stride, mask_stride;
3146
3147     __m128i xmm_src, xmm_alpha;
3148     __m128i xmm_dst;
3149     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3150
3151     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3152
3153     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3154     srca = src >> 24;
3155
3156     if (src == 0)
3157         return;
3158
3159     PIXMAN_IMAGE_GET_LINE (
3160         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3161     PIXMAN_IMAGE_GET_LINE (
3162         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3163
3164     xmm_src = _mm_unpacklo_epi8 (
3165         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3166     xmm_alpha = expand_alpha_1x128 (xmm_src);
3167     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3168     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3169
3170     while (height--)
3171     {
3172         int w = width;
3173         const uint32_t *pm = (uint32_t *)mask_line;
3174         uint32_t *pd = (uint32_t *)dst_line;
3175
3176         dst_line += dst_stride;
3177         mask_line += mask_stride;
3178
3179         /* call prefetch hint to optimize cache load*/
3180         cache_prefetch ((__m128i*)pd);
3181         cache_prefetch ((__m128i*)pm);
3182
3183         while (w && (unsigned long)pd & 15)
3184         {
3185             m = *pm++;
3186
3187             if (m)
3188             {
3189                 d = *pd;
3190
3191                 mmx_mask = unpack_32_1x64 (m);
3192                 mmx_dest = unpack_32_1x64 (d);
3193
3194                 *pd = pack_1x64_32 (
3195                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3196             }
3197
3198             pd++;
3199             w--;
3200         }
3201
3202         /* call prefetch hint to optimize cache load*/
3203         cache_prefetch ((__m128i*)pd);
3204         cache_prefetch ((__m128i*)pm);
3205
3206         while (w >= 4)
3207         {
3208             /* fill cache line with next memory */
3209             cache_prefetch_next ((__m128i*)pd);
3210             cache_prefetch_next ((__m128i*)pm);
3211
3212             xmm_mask = load_128_unaligned ((__m128i*)pm);
3213
3214             pack_cmp =
3215                 _mm_movemask_epi8 (
3216                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3217
3218             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3219             if (pack_cmp != 0xffff)
3220             {
3221                 xmm_dst = load_128_aligned ((__m128i*)pd);
3222
3223                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3224
3225                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3226                                     &xmm_mask_lo, &xmm_mask_hi,
3227                                     &xmm_mask_lo, &xmm_mask_hi);
3228                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3229
3230                 save_128_aligned (
3231                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3232             }
3233
3234             pd += 4;
3235             pm += 4;
3236             w -= 4;
3237         }
3238
3239         while (w)
3240         {
3241             m = *pm++;
3242
3243             if (m)
3244             {
3245                 d = *pd;
3246
3247                 mmx_mask = unpack_32_1x64 (m);
3248                 mmx_dest = unpack_32_1x64 (d);
3249
3250                 *pd = pack_1x64_32 (
3251                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3252             }
3253
3254             pd++;
3255             w--;
3256         }
3257     }
3258
3259     _mm_empty ();
3260 }
3261
3262 /* ---------------------------------------------------------------------------
3263  * composite_over_n_8888_8888_ca
3264  */
3265
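/* Fast path: solid source OVER destination with a per-component a8r8g8b8
 * mask.  As above, four-pixel blocks with an all-zero mask are skipped.
 */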
3266 static void
3267 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3268                                     pixman_op_t              op,
3269                                     pixman_image_t *         src_image,
3270                                     pixman_image_t *         mask_image,
3271                                     pixman_image_t *         dst_image,
3272                                     int32_t                  src_x,
3273                                     int32_t                  src_y,
3274                                     int32_t                  mask_x,
3275                                     int32_t                  mask_y,
3276                                     int32_t                  dest_x,
3277                                     int32_t                  dest_y,
3278                                     int32_t                  width,
3279                                     int32_t                  height)
3280 {
3281     uint32_t src;
3282     uint32_t    *dst_line, d;
3283     uint32_t    *mask_line, m;
3284     uint32_t pack_cmp;
3285     int dst_stride, mask_stride;
3286
3287     __m128i xmm_src, xmm_alpha;
3288     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3289     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3290
3291     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3292
3293     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3294
3295     if (src == 0)
3296         return;
3297
3298     PIXMAN_IMAGE_GET_LINE (
3299         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3300     PIXMAN_IMAGE_GET_LINE (
3301         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3302
3303     xmm_src = _mm_unpacklo_epi8 (
3304         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3305     xmm_alpha = expand_alpha_1x128 (xmm_src);
3306     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3307     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3308
3309     while (height--)
3310     {
3311         int w = width;
3312         const uint32_t *pm = (uint32_t *)mask_line;
3313         uint32_t *pd = (uint32_t *)dst_line;
3314
3315         dst_line += dst_stride;
3316         mask_line += mask_stride;
3317
3318         /* call prefetch hint to optimize cache load*/
3319         cache_prefetch ((__m128i*)pd);
3320         cache_prefetch ((__m128i*)pm);
3321
3322         while (w && (unsigned long)pd & 15)
3323         {
3324             m = *pm++;
3325
3326             if (m)
3327             {
3328                 d = *pd;
3329                 mmx_mask = unpack_32_1x64 (m);
3330                 mmx_dest = unpack_32_1x64 (d);
3331
3332                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3333                                                   &mmx_alpha,
3334                                                   &mmx_mask,
3335                                                   &mmx_dest));
3336             }
3337
3338             pd++;
3339             w--;
3340         }
3341
3342         /* call prefetch hint to optimize cache load*/
3343         cache_prefetch ((__m128i*)pd);
3344         cache_prefetch ((__m128i*)pm);
3345
3346         while (w >= 4)
3347         {
3348             /* fill cache line with next memory */
3349             cache_prefetch_next ((__m128i*)pd);
3350             cache_prefetch_next ((__m128i*)pm);
3351
3352             xmm_mask = load_128_unaligned ((__m128i*)pm);
3353
3354             pack_cmp =
3355                 _mm_movemask_epi8 (
3356                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3357
3358             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3359             if (pack_cmp != 0xffff)
3360             {
3361                 xmm_dst = load_128_aligned ((__m128i*)pd);
3362
3363                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3364                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3365
3366                 in_over_2x128 (&xmm_src, &xmm_src,
3367                                &xmm_alpha, &xmm_alpha,
3368                                &xmm_mask_lo, &xmm_mask_hi,
3369                                &xmm_dst_lo, &xmm_dst_hi);
3370
3371                 save_128_aligned (
3372                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3373             }
3374
3375             pd += 4;
3376             pm += 4;
3377             w -= 4;
3378         }
3379
3380         while (w)
3381         {
3382             m = *pm++;
3383
3384             if (m)
3385             {
3386                 d = *pd;
3387                 mmx_mask = unpack_32_1x64 (m);
3388                 mmx_dest = unpack_32_1x64 (d);
3389
3390                 *pd = pack_1x64_32 (
3391                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3392             }
3393
3394             pd++;
3395             w--;
3396         }
3397     }
3398
3399     _mm_empty ();
3400 }
3401
3402 /*---------------------------------------------------------------------
3403  * composite_over_8888_n_8888
3404  */
3405
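/* Fast path: a8r8g8b8 source OVER a 32 bpp destination, modulated by the
 * alpha of a solid mask (mask >> 24, replicated into every channel).
 */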
3406 static void
3407 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3408                                  pixman_op_t              op,
3409                                  pixman_image_t *         src_image,
3410                                  pixman_image_t *         mask_image,
3411                                  pixman_image_t *         dst_image,
3412                                  int32_t                  src_x,
3413                                  int32_t                  src_y,
3414                                  int32_t                  mask_x,
3415                                  int32_t                  mask_y,
3416                                  int32_t                  dest_x,
3417                                  int32_t                  dest_y,
3418                                  int32_t                  width,
3419                                  int32_t                  height)
3420 {
3421     uint32_t    *dst_line, *dst;
3422     uint32_t    *src_line, *src;
3423     uint32_t mask;
3424     int32_t w;
3425     int dst_stride, src_stride;
3426
3427     __m128i xmm_mask;
3428     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3429     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3430     __m128i xmm_alpha_lo, xmm_alpha_hi;
3431
3432     PIXMAN_IMAGE_GET_LINE (
3433         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3434     PIXMAN_IMAGE_GET_LINE (
3435         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3436
3437     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3438
3439     xmm_mask = create_mask_16_128 (mask >> 24);
3440
3441     while (height--)
3442     {
3443         dst = dst_line;
3444         dst_line += dst_stride;
3445         src = src_line;
3446         src_line += src_stride;
3447         w = width;
3448
3449         /* call prefetch hint to optimize cache load*/
3450         cache_prefetch ((__m128i*)dst);
3451         cache_prefetch ((__m128i*)src);
3452
3453         while (w && (unsigned long)dst & 15)
3454         {
3455             uint32_t s = *src++;
3456             uint32_t d = *dst;
3457
3458             __m64 ms = unpack_32_1x64 (s);
3459             __m64 alpha    = expand_alpha_1x64 (ms);
3460             __m64 mask     = _mm_movepi64_pi64 (xmm_mask);
3461             __m64 dest     = unpack_32_1x64 (d);
3462
3463             *dst++ = pack_1x64_32 (
3464                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3465
3466             w--;
3467         }
3468
3469         /* call prefetch hint to optimize cache load*/
3470         cache_prefetch ((__m128i*)dst);
3471         cache_prefetch ((__m128i*)src);
3472
3473         while (w >= 4)
3474         {
3475             /* fill cache line with next memory */
3476             cache_prefetch_next ((__m128i*)dst);
3477             cache_prefetch_next ((__m128i*)src);
3478
3479             xmm_src = load_128_unaligned ((__m128i*)src);
3480             xmm_dst = load_128_aligned ((__m128i*)dst);
3481
3482             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3483             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3484             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3485                                 &xmm_alpha_lo, &xmm_alpha_hi);
3486
3487             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3488                            &xmm_alpha_lo, &xmm_alpha_hi,
3489                            &xmm_mask, &xmm_mask,
3490                            &xmm_dst_lo, &xmm_dst_hi);
3491
3492             save_128_aligned (
3493                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3494
3495             dst += 4;
3496             src += 4;
3497             w -= 4;
3498         }
3499
3500         while (w)
3501         {
3502             uint32_t s = *src++;
3503             uint32_t d = *dst;
3504
3505             __m64 ms = unpack_32_1x64 (s);
3506             __m64 alpha = expand_alpha_1x64 (ms);
3507             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3508             __m64 dest  = unpack_32_1x64 (d);
3509
3510             *dst++ = pack_1x64_32 (
3511                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3512
3513             w--;
3514         }
3515     }
3516
3517     _mm_empty ();
3518 }
3519
3520 /*---------------------------------------------------------------------
3521  * composite_src_x888_8888
3522  */
3523
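/* SRC from x8r8g8b8 to a8r8g8b8 is a straight copy that forces the unused
 * alpha byte to 0xff, i.e. per pixel
 *
 *     *dst = *src | 0xff000000;
 *
 * The wide loop below does the same for 16 pixels per iteration with four
 * 128-bit loads and stores, OR-ing in mask_ff000000.
 */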
3524 static void
3525 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
3526                               pixman_op_t              op,
3527                               pixman_image_t *         src_image,
3528                               pixman_image_t *         mask_image,
3529                               pixman_image_t *         dst_image,
3530                               int32_t                  src_x,
3531                               int32_t                  src_y,
3532                               int32_t                  mask_x,
3533                               int32_t                  mask_y,
3534                               int32_t                  dest_x,
3535                               int32_t                  dest_y,
3536                               int32_t                  width,
3537                               int32_t                  height)
3538 {
3539     uint32_t    *dst_line, *dst;
3540     uint32_t    *src_line, *src;
3541     int32_t w;
3542     int dst_stride, src_stride;
3543
3544
3545     PIXMAN_IMAGE_GET_LINE (
3546         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3547     PIXMAN_IMAGE_GET_LINE (
3548         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3549
3550     while (height--)
3551     {
3552         dst = dst_line;
3553         dst_line += dst_stride;
3554         src = src_line;
3555         src_line += src_stride;
3556         w = width;
3557
3558         /* call prefetch hint to optimize cache load*/
3559         cache_prefetch ((__m128i*)src);
3560
3561         while (w && (unsigned long)dst & 15)
3562         {
3563             *dst++ = *src++ | 0xff000000;
3564             w--;
3565         }
3566
3567         /* call prefetch hint to optimize cache load*/
3568         cache_prefetch ((__m128i*)src);
3569
3570         while (w >= 16)
3571         {
3572             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
3573             
3574             /* fill cache line with next memory */
3575             cache_prefetch_next ((__m128i*)src);
3576
3577             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
3578             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
3579             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
3580             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
3581             
3582             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
3583             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
3584             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
3585             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
3586             
3587             dst += 16;
3588             src += 16;
3589             w -= 16;
3590         }
3591
3592         while (w)
3593         {
3594             *dst++ = *src++ | 0xff000000;
3595             w--;
3596         }
3597     }
3598
3599     _mm_empty ();
3600 }
3601
3602 /* ---------------------------------------------------------------------
3603  * composite_over_x888_n_8888
3604  */
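/* Like composite_over_8888_n_8888 above, but the source has no alpha
 * channel, so each source pixel is treated as opaque (s | 0xff000000 and
 * xmm_alpha = mask_00ff).  Roughly, with m = mask >> 24 from the solid
 * mask, every destination channel becomes
 *
 *     dst = src * m/255 + dst * (255 - m) / 255
 */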
3605 static void
3606 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3607                                  pixman_op_t              op,
3608                                  pixman_image_t *         src_image,
3609                                  pixman_image_t *         mask_image,
3610                                  pixman_image_t *         dst_image,
3611                                  int32_t                  src_x,
3612                                  int32_t                  src_y,
3613                                  int32_t                  mask_x,
3614                                  int32_t                  mask_y,
3615                                  int32_t                  dest_x,
3616                                  int32_t                  dest_y,
3617                                  int32_t                  width,
3618                                  int32_t                  height)
3619 {
3620     uint32_t    *dst_line, *dst;
3621     uint32_t    *src_line, *src;
3622     uint32_t mask;
3623     int dst_stride, src_stride;
3624     int32_t w;
3625
3626     __m128i xmm_mask, xmm_alpha;
3627     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3628     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3629
3630     PIXMAN_IMAGE_GET_LINE (
3631         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3632     PIXMAN_IMAGE_GET_LINE (
3633         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3634
3635     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3636
3637     xmm_mask = create_mask_16_128 (mask >> 24);
3638     xmm_alpha = mask_00ff;
3639
3640     while (height--)
3641     {
3642         dst = dst_line;
3643         dst_line += dst_stride;
3644         src = src_line;
3645         src_line += src_stride;
3646         w = width;
3647
3648         /* call prefetch hint to optimize cache load*/
3649         cache_prefetch ((__m128i*)dst);
3650         cache_prefetch ((__m128i*)src);
3651
3652         while (w && (unsigned long)dst & 15)
3653         {
3654             uint32_t s = (*src++) | 0xff000000;
3655             uint32_t d = *dst;
3656
3657             __m64 src   = unpack_32_1x64 (s);
3658             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3659             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3660             __m64 dest  = unpack_32_1x64 (d);
3661
3662             *dst++ = pack_1x64_32 (
3663                 in_over_1x64 (&src, &alpha, &mask, &dest));
3664
3665             w--;
3666         }
3667
3668         /* call prefetch hint to optimize cache load*/
3669         cache_prefetch ((__m128i*)dst);
3670         cache_prefetch ((__m128i*)src);
3671
3672         while (w >= 4)
3673         {
3674             /* fill cache line with next memory */
3675             cache_prefetch_next ((__m128i*)dst);
3676             cache_prefetch_next ((__m128i*)src);
3677
3678             xmm_src = _mm_or_si128 (
3679                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3680             xmm_dst = load_128_aligned ((__m128i*)dst);
3681
3682             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3683             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3684
3685             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3686                            &xmm_alpha, &xmm_alpha,
3687                            &xmm_mask, &xmm_mask,
3688                            &xmm_dst_lo, &xmm_dst_hi);
3689
3690             save_128_aligned (
3691                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3692
3693             dst += 4;
3694             src += 4;
3695             w -= 4;
3696
3697         }
3698
3699         while (w)
3700         {
3701             uint32_t s = (*src++) | 0xff000000;
3702             uint32_t d = *dst;
3703
3704             __m64 src  = unpack_32_1x64 (s);
3705             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3706             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3707             __m64 dest  = unpack_32_1x64 (d);
3708
3709             *dst++ = pack_1x64_32 (
3710                 in_over_1x64 (&src, &alpha, &mask, &dest));
3711
3712             w--;
3713         }
3714     }
3715
3716     _mm_empty ();
3717 }
3718
3719 /* --------------------------------------------------------------------
3720  * composite_over_8888_8888
3721  */
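/* Plain OVER of a8r8g8b8 onto a8r8g8b8.  Each row is handed to
 * core_combine_over_u_sse2 (), which computes, per channel and ignoring
 * the exact rounding,
 *
 *     dst = src + dst * (255 - srca) / 255
 *
 * the usual premultiplied-alpha OVER operator.
 */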
3722 static void
3723 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3724                                pixman_op_t              op,
3725                                pixman_image_t *         src_image,
3726                                pixman_image_t *         mask_image,
3727                                pixman_image_t *         dst_image,
3728                                int32_t                  src_x,
3729                                int32_t                  src_y,
3730                                int32_t                  mask_x,
3731                                int32_t                  mask_y,
3732                                int32_t                  dest_x,
3733                                int32_t                  dest_y,
3734                                int32_t                  width,
3735                                int32_t                  height)
3736 {
3737     int dst_stride, src_stride;
3738     uint32_t    *dst_line, *dst;
3739     uint32_t    *src_line, *src;
3740
3741     PIXMAN_IMAGE_GET_LINE (
3742         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3743     PIXMAN_IMAGE_GET_LINE (
3744         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3745
3746     dst = dst_line;
3747     src = src_line;
3748
3749     while (height--)
3750     {
3751         core_combine_over_u_sse2 (dst, src, NULL, width);
3752
3753         dst += dst_stride;
3754         src += src_stride;
3755     }
3756     _mm_empty ();
3757 }
3758
3759 /* ------------------------------------------------------------------
3760  * composite_over_8888_0565
3761  */
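/* OVER of a8r8g8b8 onto r5g6b5.  Destination pixels are expanded from 565
 * to 8888, blended with the premultiplied OVER operator (roughly
 * dst = src + dst * (255 - srca) / 255 per channel) and packed back to
 * 565.  composite_over_8888_0565pixel () below is the single-pixel form;
 * the main loop handles 8 destination pixels (one aligned 128-bit load)
 * per iteration.
 */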
3762 static force_inline uint16_t
3763 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3764 {
3765     __m64 ms;
3766
3767     ms = unpack_32_1x64 (src);
3768     return pack_565_32_16 (
3769         pack_1x64_32 (
3770             over_1x64 (
3771                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3772 }
3773
3774 static void
3775 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3776                                pixman_op_t              op,
3777                                pixman_image_t *         src_image,
3778                                pixman_image_t *         mask_image,
3779                                pixman_image_t *         dst_image,
3780                                int32_t                  src_x,
3781                                int32_t                  src_y,
3782                                int32_t                  mask_x,
3783                                int32_t                  mask_y,
3784                                int32_t                  dest_x,
3785                                int32_t                  dest_y,
3786                                int32_t                  width,
3787                                int32_t                  height)
3788 {
3789     uint16_t    *dst_line, *dst, d;
3790     uint32_t    *src_line, *src, s;
3791     int dst_stride, src_stride;
3792     int32_t w;
3793
3794     __m128i xmm_alpha_lo, xmm_alpha_hi;
3795     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3796     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3797
3798     PIXMAN_IMAGE_GET_LINE (
3799         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3800     PIXMAN_IMAGE_GET_LINE (
3801         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3802
3803 #if 0
3804     /* FIXME
3805      *
3806      * This code was copied from the MMX implementation, FIXME included.
3807      * If it is a problem there, it is probably a problem here too.
3808      */
3809     assert (src_image->drawable == mask_image->drawable);
3810 #endif
3811
3812     while (height--)
3813     {
3814         dst = dst_line;
3815         src = src_line;
3816
3817         /* call prefetch hint to optimize cache load*/
3818         cache_prefetch ((__m128i*)src);
3819         cache_prefetch ((__m128i*)dst);
3820
3821         dst_line += dst_stride;
3822         src_line += src_stride;
3823         w = width;
3824
3825         /* Align dst on a 16-byte boundary */
3826         while (w &&
3827                ((unsigned long)dst & 15))
3828         {
3829             s = *src++;
3830             d = *dst;
3831
3832             *dst++ = composite_over_8888_0565pixel (s, d);
3833             w--;
3834         }
3835
3836         /* call prefetch hint to optimize cache load*/
3837         cache_prefetch ((__m128i*)src);
3838         cache_prefetch ((__m128i*)dst);
3839
3840         /* This is an 8-pixel loop */
3841         while (w >= 8)
3842         {
3843             /* fill cache line with next memory */
3844             cache_prefetch_next ((__m128i*)src);
3845             cache_prefetch_next ((__m128i*)dst);
3846
3847             /* The source is loaded unaligned; only dst is
3848              * known to be 16-byte aligned at this point.
3849              */
3850             xmm_src = load_128_unaligned ((__m128i*) src);
3851             xmm_dst = load_128_aligned ((__m128i*) dst);
3852
3853             /* Unpacking */
3854             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3855             unpack_565_128_4x128 (xmm_dst,
3856                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3857             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3858                                 &xmm_alpha_lo, &xmm_alpha_hi);
3859
3860             /* Load the next 4 source pixels early so that the
3861              * memory read overlaps with the blend below.
3862              */
3863             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3864
3865             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3866                         &xmm_alpha_lo, &xmm_alpha_hi,
3867                         &xmm_dst0, &xmm_dst1);
3868
3869             /* Unpacking */
3870             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3871             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3872                                 &xmm_alpha_lo, &xmm_alpha_hi);
3873
3874             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3875                         &xmm_alpha_lo, &xmm_alpha_hi,
3876                         &xmm_dst2, &xmm_dst3);
3877
3878             save_128_aligned (
3879                 (__m128i*)dst, pack_565_4x128_128 (
3880                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3881
3882             w -= 8;
3883             dst += 8;
3884             src += 8;
3885         }
3886
3887         while (w--)
3888         {
3889             s = *src++;
3890             d = *dst;
3891
3892             *dst++ = composite_over_8888_0565pixel (s, d);
3893         }
3894     }
3895
3896     _mm_empty ();
3897 }
3898
3899 /* -----------------------------------------------------------------
3900  * composite_over_n_8_8888
3901  */
3902
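/* Solid source, a8 mask, a8r8g8b8 destination.  Roughly, with m being the
 * 8-bit mask value, every destination channel becomes
 *
 *     dst = src * m/255 + dst * (255 - srca * m/255) / 255
 *
 * The wide loop fetches four mask bytes at once: m == 0 leaves the pixels
 * untouched, and an opaque source with all four mask bytes 0xff stores
 * the replicated solid color directly.
 */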
3903 static void
3904 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3905                               pixman_op_t              op,
3906                               pixman_image_t *         src_image,
3907                               pixman_image_t *         mask_image,
3908                               pixman_image_t *         dst_image,
3909                               int32_t                  src_x,
3910                               int32_t                  src_y,
3911                               int32_t                  mask_x,
3912                               int32_t                  mask_y,
3913                               int32_t                  dest_x,
3914                               int32_t                  dest_y,
3915                               int32_t                  width,
3916                               int32_t                  height)
3917 {
3918     uint32_t src, srca;
3919     uint32_t *dst_line, *dst;
3920     uint8_t *mask_line, *mask;
3921     int dst_stride, mask_stride;
3922     int32_t w;
3923     uint32_t m, d;
3924
3925     __m128i xmm_src, xmm_alpha, xmm_def;
3926     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3927     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3928
3929     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3930
3931     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3932
3933     srca = src >> 24;
3934     if (src == 0)
3935         return;
3936
3937     PIXMAN_IMAGE_GET_LINE (
3938         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3939     PIXMAN_IMAGE_GET_LINE (
3940         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3941
3942     xmm_def = create_mask_2x32_128 (src, src);
3943     xmm_src = expand_pixel_32_1x128 (src);
3944     xmm_alpha = expand_alpha_1x128 (xmm_src);
3945     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3946     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3947
3948     while (height--)
3949     {
3950         dst = dst_line;
3951         dst_line += dst_stride;
3952         mask = mask_line;
3953         mask_line += mask_stride;
3954         w = width;
3955
3956         /* call prefetch hint to optimize cache load*/
3957         cache_prefetch ((__m128i*)mask);
3958         cache_prefetch ((__m128i*)dst);
3959
3960         while (w && (unsigned long)dst & 15)
3961         {
3962             uint8_t m = *mask++;
3963
3964             if (m)
3965             {
3966                 d = *dst;
3967                 mmx_mask = expand_pixel_8_1x64 (m);
3968                 mmx_dest = unpack_32_1x64 (d);
3969
3970                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3971                                                    &mmx_alpha,
3972                                                    &mmx_mask,
3973                                                    &mmx_dest));
3974             }
3975
3976             w--;
3977             dst++;
3978         }
3979
3980         /* call prefetch hint to optimize cache load*/
3981         cache_prefetch ((__m128i*)mask);
3982         cache_prefetch ((__m128i*)dst);
3983
3984         while (w >= 4)
3985         {
3986             /* fill cache line with next memory */
3987             cache_prefetch_next ((__m128i*)mask);
3988             cache_prefetch_next ((__m128i*)dst);
3989
3990             m = *((uint32_t*)mask);
3991
3992             if (srca == 0xff && m == 0xffffffff)
3993             {
3994                 save_128_aligned ((__m128i*)dst, xmm_def);
3995             }
3996             else if (m)
3997             {
3998                 xmm_dst = load_128_aligned ((__m128i*) dst);
3999                 xmm_mask = unpack_32_1x128 (m);
4000                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4001
4002                 /* Unpacking */
4003                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4004                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4005
4006                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4007                                         &xmm_mask_lo, &xmm_mask_hi);
4008
4009                 in_over_2x128 (&xmm_src, &xmm_src,
4010                                &xmm_alpha, &xmm_alpha,
4011                                &xmm_mask_lo, &xmm_mask_hi,
4012                                &xmm_dst_lo, &xmm_dst_hi);
4013
4014                 save_128_aligned (
4015                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4016             }
4017
4018             w -= 4;
4019             dst += 4;
4020             mask += 4;
4021         }
4022
4023         while (w)
4024         {
4025             uint8_t m = *mask++;
4026
4027             if (m)
4028             {
4029                 d = *dst;
4030                 mmx_mask = expand_pixel_8_1x64 (m);
4031                 mmx_dest = unpack_32_1x64 (d);
4032
4033                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
4034                                                    &mmx_alpha,
4035                                                    &mmx_mask,
4036                                                    &mmx_dest));
4037             }
4038
4039             w--;
4040             dst++;
4041         }
4042     }
4043
4044     _mm_empty ();
4045 }
4046
4047 /* ----------------------------------------------------------------
4048  * pixman_fill_sse2
4049  */
4050
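/* Solid fill for 8, 16 and 32 bpp destinations.  The fill value is first
 * replicated into a 32-bit pattern and into the 128-bit xmm_def; each
 * scanline is then written with byte/word/dword stores until the pointer
 * is 16-byte aligned, aligned 128/64/32/16-byte blocks in the middle, and
 * dword/word/byte stores for the tail.  Unsupported depths return FALSE
 * so the caller can take the general path.
 */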
4051 pixman_bool_t
4052 pixman_fill_sse2 (uint32_t *bits,
4053                   int       stride,
4054                   int       bpp,
4055                   int       x,
4056                   int       y,
4057                   int       width,
4058                   int       height,
4059                   uint32_t  data)
4060 {
4061     uint32_t byte_width;
4062     uint8_t         *byte_line;
4063
4064     __m128i xmm_def;
4065
4066     if (bpp == 8)
4067     {
4068         uint8_t b;
4069         uint16_t w;
4070
4071         stride = stride * (int) sizeof (uint32_t) / 1;
4072         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
4073         byte_width = width;
4074         stride *= 1;
4075
4076         b = data & 0xff;
4077         w = (b << 8) | b;
4078         data = (w << 16) | w;
4079     }
4080     else if (bpp == 16)
4081     {
4082         stride = stride * (int) sizeof (uint32_t) / 2;
4083         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
4084         byte_width = 2 * width;
4085         stride *= 2;
4086
4087         data = (data & 0xffff) * 0x00010001;
4088     }
4089     else if (bpp == 32)
4090     {
4091         stride = stride * (int) sizeof (uint32_t) / 4;
4092         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
4093         byte_width = 4 * width;
4094         stride *= 4;
4095     }
4096     else
4097     {
4098         return FALSE;
4099     }
4100
4101     cache_prefetch ((__m128i*)byte_line);
4102     xmm_def = create_mask_2x32_128 (data, data);
4103
4104     while (height--)
4105     {
4106         int w;
4107         uint8_t *d = byte_line;
4108         byte_line += stride;
4109         w = byte_width;
4110
4111         cache_prefetch_next ((__m128i*)d);
4112
4113         while (w >= 1 && ((unsigned long)d & 1))
4114         {
4115             *(uint8_t *)d = data;
4116             w -= 1;
4117             d += 1;
4118         }
4119
4120         while (w >= 2 && ((unsigned long)d & 3))
4121         {
4122             *(uint16_t *)d = data;
4123             w -= 2;
4124             d += 2;
4125         }
4126
4127         while (w >= 4 && ((unsigned long)d & 15))
4128         {
4129             *(uint32_t *)d = data;
4130
4131             w -= 4;
4132             d += 4;
4133         }
4134
4135         cache_prefetch_next ((__m128i*)d);
4136
4137         while (w >= 128)
4138         {
4139             cache_prefetch (((__m128i*)d) + 12);
4140
4141             save_128_aligned ((__m128i*)(d),     xmm_def);
4142             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4143             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4144             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4145             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
4146             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
4147             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
4148             save_128_aligned ((__m128i*)(d + 112), xmm_def);
4149
4150             d += 128;
4151             w -= 128;
4152         }
4153
4154         if (w >= 64)
4155         {
4156             cache_prefetch (((__m128i*)d) + 8);
4157
4158             save_128_aligned ((__m128i*)(d),     xmm_def);
4159             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4160             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4161             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4162
4163             d += 64;
4164             w -= 64;
4165         }
4166
4167         cache_prefetch_next ((__m128i*)d);
4168
4169         if (w >= 32)
4170         {
4171             save_128_aligned ((__m128i*)(d),     xmm_def);
4172             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4173
4174             d += 32;
4175             w -= 32;
4176         }
4177
4178         if (w >= 16)
4179         {
4180             save_128_aligned ((__m128i*)(d),     xmm_def);
4181
4182             d += 16;
4183             w -= 16;
4184         }
4185
4186         cache_prefetch_next ((__m128i*)d);
4187
4188         while (w >= 4)
4189         {
4190             *(uint32_t *)d = data;
4191
4192             w -= 4;
4193             d += 4;
4194         }
4195
4196         if (w >= 2)
4197         {
4198             *(uint16_t *)d = data;
4199             w -= 2;
4200             d += 2;
4201         }
4202
4203         if (w >= 1)
4204         {
4205             *(uint8_t *)d = data;
4206             w -= 1;
4207             d += 1;
4208         }
4209     }
4210
4211     _mm_empty ();
4212     return TRUE;
4213 }
4214
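/* ----------------------------------------------------------------
 * composite_src_n_8_8888
 *
 * Solid source, a8 mask, SRC operator: the destination is replaced with
 * the mask-scaled source, roughly dst = src * m/255 per channel, and with
 * 0 where the mask is 0.  A zero source degenerates into
 * pixman_fill_sse2 () with 0.
 */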
4215 static void
4216 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4217                              pixman_op_t              op,
4218                              pixman_image_t *         src_image,
4219                              pixman_image_t *         mask_image,
4220                              pixman_image_t *         dst_image,
4221                              int32_t                  src_x,
4222                              int32_t                  src_y,
4223                              int32_t                  mask_x,
4224                              int32_t                  mask_y,
4225                              int32_t                  dest_x,
4226                              int32_t                  dest_y,
4227                              int32_t                  width,
4228                              int32_t                  height)
4229 {
4230     uint32_t src, srca;
4231     uint32_t    *dst_line, *dst;
4232     uint8_t     *mask_line, *mask;
4233     int dst_stride, mask_stride;
4234     int32_t w;
4235     uint32_t m;
4236
4237     __m128i xmm_src, xmm_def;
4238     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4239
4240     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4241
4242     srca = src >> 24;
4243     if (src == 0)
4244     {
4245         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4246                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
4247                           dest_x, dest_y, width, height, 0);
4248         return;
4249     }
4250
4251     PIXMAN_IMAGE_GET_LINE (
4252         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4253     PIXMAN_IMAGE_GET_LINE (
4254         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4255
4256     xmm_def = create_mask_2x32_128 (src, src);
4257     xmm_src = expand_pixel_32_1x128 (src);
4258
4259     while (height--)
4260     {
4261         dst = dst_line;
4262         dst_line += dst_stride;
4263         mask = mask_line;
4264         mask_line += mask_stride;
4265         w = width;
4266
4267         /* call prefetch hint to optimize cache load*/
4268         cache_prefetch ((__m128i*)mask);
4269         cache_prefetch ((__m128i*)dst);
4270
4271         while (w && (unsigned long)dst & 15)
4272         {
4273             uint8_t m = *mask++;
4274
4275             if (m)
4276             {
4277                 *dst = pack_1x64_32 (
4278                     pix_multiply_1x64 (
4279                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4280             }
4281             else
4282             {
4283                 *dst = 0;
4284             }
4285
4286             w--;
4287             dst++;
4288         }
4289
4290         /* call prefetch hint to optimize cache load*/
4291         cache_prefetch ((__m128i*)mask);
4292         cache_prefetch ((__m128i*)dst);
4293
4294         while (w >= 4)
4295         {
4296             /* fill cache line with next memory */
4297             cache_prefetch_next ((__m128i*)mask);
4298             cache_prefetch_next ((__m128i*)dst);
4299
4300             m = *((uint32_t*)mask);
4301
4302             if (srca == 0xff && m == 0xffffffff)
4303             {
4304                 save_128_aligned ((__m128i*)dst, xmm_def);
4305             }
4306             else if (m)
4307             {
4308                 xmm_mask = unpack_32_1x128 (m);
4309                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4310
4311                 /* Unpacking */
4312                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4313
4314                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4315                                         &xmm_mask_lo, &xmm_mask_hi);
4316
4317                 pix_multiply_2x128 (&xmm_src, &xmm_src,
4318                                     &xmm_mask_lo, &xmm_mask_hi,
4319                                     &xmm_mask_lo, &xmm_mask_hi);
4320
4321                 save_128_aligned (
4322                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4323             }
4324             else
4325             {
4326                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4327             }
4328
4329             w -= 4;
4330             dst += 4;
4331             mask += 4;
4332         }
4333
4334         while (w)
4335         {
4336             uint8_t m = *mask++;
4337
4338             if (m)
4339             {
4340                 *dst = pack_1x64_32 (
4341                     pix_multiply_1x64 (
4342                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4343             }
4344             else
4345             {
4346                 *dst = 0;
4347             }
4348
4349             w--;
4350             dst++;
4351         }
4352     }
4353
4354     _mm_empty ();
4355 }
4356
4357 /*-----------------------------------------------------------------------
4358  * composite_over_n_8_0565
4359  */
4360
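/* Same math as composite_over_n_8_8888 above (solid source, a8 mask,
 * (src IN m) OVER dst), but the destination is r5g6b5: pixels are
 * expanded to 8888, blended, and packed back to 565, eight destination
 * pixels per wide iteration.
 */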
4361 static void
4362 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4363                               pixman_op_t              op,
4364                               pixman_image_t *         src_image,
4365                               pixman_image_t *         mask_image,
4366                               pixman_image_t *         dst_image,
4367                               int32_t                  src_x,
4368                               int32_t                  src_y,
4369                               int32_t                  mask_x,
4370                               int32_t                  mask_y,
4371                               int32_t                  dest_x,
4372                               int32_t                  dest_y,
4373                               int32_t                  width,
4374                               int32_t                  height)
4375 {
4376     uint32_t src, srca;
4377     uint16_t    *dst_line, *dst, d;
4378     uint8_t     *mask_line, *mask;
4379     int dst_stride, mask_stride;
4380     int32_t w;
4381     uint32_t m;
4382     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4383
4384     __m128i xmm_src, xmm_alpha;
4385     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4386     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4387
4388     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4389
4390     srca = src >> 24;
4391     if (src == 0)
4392         return;
4393
4394     PIXMAN_IMAGE_GET_LINE (
4395         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4396     PIXMAN_IMAGE_GET_LINE (
4397         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4398
4399     xmm_src = expand_pixel_32_1x128 (src);
4400     xmm_alpha = expand_alpha_1x128 (xmm_src);
4401     mmx_src = _mm_movepi64_pi64 (xmm_src);
4402     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4403
4404     while (height--)
4405     {
4406         dst = dst_line;
4407         dst_line += dst_stride;
4408         mask = mask_line;
4409         mask_line += mask_stride;
4410         w = width;
4411
4412         /* call prefetch hint to optimize cache load*/
4413         cache_prefetch ((__m128i*)mask);
4414         cache_prefetch ((__m128i*)dst);
4415
4416         while (w && (unsigned long)dst & 15)
4417         {
4418             m = *mask++;
4419
4420             if (m)
4421             {
4422                 d = *dst;
4423                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4424                 mmx_dest = expand565_16_1x64 (d);
4425
4426                 *dst = pack_565_32_16 (
4427                     pack_1x64_32 (
4428                         in_over_1x64 (
4429                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4430             }
4431
4432             w--;
4433             dst++;
4434         }
4435
4436         /* call prefetch hint to optimize cache load*/
4437         cache_prefetch ((__m128i*)mask);
4438         cache_prefetch ((__m128i*)dst);
4439
4440         while (w >= 8)
4441         {
4442             /* fill cache line with next memory */
4443             cache_prefetch_next ((__m128i*)mask);
4444             cache_prefetch_next ((__m128i*)dst);
4445
4446             xmm_dst = load_128_aligned ((__m128i*) dst);
4447             unpack_565_128_4x128 (xmm_dst,
4448                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4449
4450             m = *((uint32_t*)mask);
4451             mask += 4;
4452
4453             if (m)
4454             {
4455                 xmm_mask = unpack_32_1x128 (m);
4456                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4457
4458                 /* Unpacking */
4459                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4460
4461                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4462                                         &xmm_mask_lo, &xmm_mask_hi);
4463
4464                 in_over_2x128 (&xmm_src, &xmm_src,
4465                                &xmm_alpha, &xmm_alpha,
4466                                &xmm_mask_lo, &xmm_mask_hi,
4467                                &xmm_dst0, &xmm_dst1);
4468             }
4469
4470             m = *((uint32_t*)mask);
4471             mask += 4;
4472
4473             if (m)
4474             {
4475                 xmm_mask = unpack_32_1x128 (m);
4476                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4477
4478                 /* Unpacking */
4479                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4480
4481                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4482                                         &xmm_mask_lo, &xmm_mask_hi);
4483                 in_over_2x128 (&xmm_src, &xmm_src,
4484                                &xmm_alpha, &xmm_alpha,
4485                                &xmm_mask_lo, &xmm_mask_hi,
4486                                &xmm_dst2, &xmm_dst3);
4487             }
4488
4489             save_128_aligned (
4490                 (__m128i*)dst, pack_565_4x128_128 (
4491                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4492
4493             w -= 8;
4494             dst += 8;
4495         }
4496
4497         while (w)
4498         {
4499             m = *mask++;
4500
4501             if (m)
4502             {
4503                 d = *dst;
4504                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4505                 mmx_dest = expand565_16_1x64 (d);
4506
4507                 *dst = pack_565_32_16 (
4508                     pack_1x64_32 (
4509                         in_over_1x64 (
4510                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4511             }
4512
4513             w--;
4514             dst++;
4515         }
4516     }
4517
4518     _mm_empty ();
4519 }
4520
4521 /* -----------------------------------------------------------------------
4522  * composite_over_pixbuf_0565
4523  */
4524
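/* "pixbuf" sources are non-premultiplied a8b8g8r8 (the same image is
 * passed as both source and mask).  Roughly, each source pixel has its
 * R/B channels swapped and is premultiplied on the fly
 * (over_rev_non_pre_*), then composited OVER the r5g6b5 destination.
 * Per 128-bit block there are two shortcuts: a fully opaque block only
 * needs the channel swap, and a fully transparent block leaves the
 * destination untouched.
 */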
4525 static void
4526 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4527                                  pixman_op_t              op,
4528                                  pixman_image_t *         src_image,
4529                                  pixman_image_t *         mask_image,
4530                                  pixman_image_t *         dst_image,
4531                                  int32_t                  src_x,
4532                                  int32_t                  src_y,
4533                                  int32_t                  mask_x,
4534                                  int32_t                  mask_y,
4535                                  int32_t                  dest_x,
4536                                  int32_t                  dest_y,
4537                                  int32_t                  width,
4538                                  int32_t                  height)
4539 {
4540     uint16_t    *dst_line, *dst, d;
4541     uint32_t    *src_line, *src, s;
4542     int dst_stride, src_stride;
4543     int32_t w;
4544     uint32_t opaque, zero;
4545
4546     __m64 ms;
4547     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4548     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4549
4550     PIXMAN_IMAGE_GET_LINE (
4551         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4552     PIXMAN_IMAGE_GET_LINE (
4553         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4554
4555 #if 0
4556     /* FIXME
4557      *
4558      * This code was copied from the MMX implementation, FIXME included.
4559      * If it is a problem there, it is probably a problem here too.
4560      */
4561     assert (src_image->drawable == mask_image->drawable);
4562 #endif
4563
4564     while (height--)
4565     {
4566         dst = dst_line;
4567         dst_line += dst_stride;
4568         src = src_line;
4569         src_line += src_stride;
4570         w = width;
4571
4572         /* call prefetch hint to optimize cache load*/
4573         cache_prefetch ((__m128i*)src);
4574         cache_prefetch ((__m128i*)dst);
4575
4576         while (w && (unsigned long)dst & 15)
4577         {
4578             s = *src++;
4579             d = *dst;
4580
4581             ms = unpack_32_1x64 (s);
4582
4583             *dst++ = pack_565_32_16 (
4584                 pack_1x64_32 (
4585                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4586             w--;
4587         }
4588
4589         /* call prefetch hint to optimize cache load*/
4590         cache_prefetch ((__m128i*)src);
4591         cache_prefetch ((__m128i*)dst);
4592
4593         while (w >= 8)
4594         {
4595             /* fill cache line with next memory */
4596             cache_prefetch_next ((__m128i*)src);
4597             cache_prefetch_next ((__m128i*)dst);
4598
4599             /* First round */
4600             xmm_src = load_128_unaligned ((__m128i*)src);
4601             xmm_dst = load_128_aligned  ((__m128i*)dst);
4602
4603             opaque = is_opaque (xmm_src);
4604             zero = is_zero (xmm_src);
4605
4606             unpack_565_128_4x128 (xmm_dst,
4607                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4608             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4609
4610             /* preload next round*/
4611             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4612
4613             if (opaque)
4614             {
4615                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4616                                      &xmm_dst0, &xmm_dst1);
4617             }
4618             else if (!zero)
4619             {
4620                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4621                                         &xmm_dst0, &xmm_dst1);
4622             }
4623
4624             /* Second round */
4625             opaque = is_opaque (xmm_src);
4626             zero = is_zero (xmm_src);
4627
4628             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4629
4630             if (opaque)
4631             {
4632                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4633                                      &xmm_dst2, &xmm_dst3);
4634             }
4635             else if (!zero)
4636             {
4637                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4638                                         &xmm_dst2, &xmm_dst3);
4639             }
4640
4641             save_128_aligned (
4642                 (__m128i*)dst, pack_565_4x128_128 (
4643                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4644
4645             w -= 8;
4646             src += 8;
4647             dst += 8;
4648         }
4649
4650         while (w)
4651         {
4652             s = *src++;
4653             d = *dst;
4654
4655             ms = unpack_32_1x64 (s);
4656
4657             *dst++ = pack_565_32_16 (
4658                 pack_1x64_32 (
4659                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4660             w--;
4661         }
4662     }
4663
4664     _mm_empty ();
4665 }
4666
4667 /* -------------------------------------------------------------------------
4668  * composite_over_pixbuf_8888
4669  */
4670
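/* The same non-premultiplied "pixbuf" source as above, composited OVER an
 * a8r8g8b8 destination: fully opaque source blocks are channel-swapped and
 * stored directly, fully transparent blocks are skipped, and everything
 * else goes through over_rev_non_pre_2x128 ().
 */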
4671 static void
4672 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4673                                  pixman_op_t              op,
4674                                  pixman_image_t *         src_image,
4675                                  pixman_image_t *         mask_image,
4676                                  pixman_image_t *         dst_image,
4677                                  int32_t                  src_x,
4678                                  int32_t                  src_y,
4679                                  int32_t                  mask_x,
4680                                  int32_t                  mask_y,
4681                                  int32_t                  dest_x,
4682                                  int32_t                  dest_y,
4683                                  int32_t                  width,
4684                                  int32_t                  height)
4685 {
4686     uint32_t    *dst_line, *dst, d;
4687     uint32_t    *src_line, *src, s;
4688     int dst_stride, src_stride;
4689     int32_t w;
4690     uint32_t opaque, zero;
4691
4692     __m128i xmm_src_lo, xmm_src_hi;
4693     __m128i xmm_dst_lo, xmm_dst_hi;
4694
4695     PIXMAN_IMAGE_GET_LINE (
4696         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4697     PIXMAN_IMAGE_GET_LINE (
4698         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4699
4700 #if 0
4701     /* FIXME
4702      *
4703      * This code was copied from the MMX implementation, FIXME included.
4704      * If it is a problem there, it is probably a problem here too.
4705      */
4706     assert (src_image->drawable == mask_image->drawable);
4707 #endif
4708
4709     while (height--)
4710     {
4711         dst = dst_line;
4712         dst_line += dst_stride;
4713         src = src_line;
4714         src_line += src_stride;
4715         w = width;
4716
4717         /* call prefetch hint to optimize cache load*/
4718         cache_prefetch ((__m128i*)src);
4719         cache_prefetch ((__m128i*)dst);
4720
4721         while (w && (unsigned long)dst & 15)
4722         {
4723             s = *src++;
4724             d = *dst;
4725
4726             *dst++ = pack_1x64_32 (
4727                 over_rev_non_pre_1x64 (
4728                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4729
4730             w--;
4731         }
4732
4733         /* call prefetch hint to optimize cache load*/
4734         cache_prefetch ((__m128i*)src);
4735         cache_prefetch ((__m128i*)dst);
4736
4737         while (w >= 4)
4738         {
4739             /* fill cache line with next memory */
4740             cache_prefetch_next ((__m128i*)src);
4741             cache_prefetch_next ((__m128i*)dst);
4742
4743             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4744
4745             opaque = is_opaque (xmm_src_hi);
4746             zero = is_zero (xmm_src_hi);
4747
4748             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4749
4750             if (opaque)
4751             {
4752                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4753                                      &xmm_dst_lo, &xmm_dst_hi);
4754
4755                 save_128_aligned (
4756                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4757             }
4758             else if (!zero)
4759             {
4760                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4761
4762                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4763
4764                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4765                                         &xmm_dst_lo, &xmm_dst_hi);
4766
4767                 save_128_aligned (
4768                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4769             }
4770
4771             w -= 4;
4772             dst += 4;
4773             src += 4;
4774         }
4775
4776         while (w)
4777         {
4778             s = *src++;
4779             d = *dst;
4780
4781             *dst++ = pack_1x64_32 (
4782                 over_rev_non_pre_1x64 (
4783                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4784
4785             w--;
4786         }
4787     }
4788
4789     _mm_empty ();
4790 }
4791
4792 /* ---------------------------------------------------------------------
4793  * composite_over_n_8888_0565_ca
4794  */
4795
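/* Component-alpha variant: the mask is a full a8r8g8b8 image and each of
 * its channels scales the matching channel of the solid source.  Roughly,
 * per channel c,
 *
 *     dst.c = src.c * mask.c/255 + dst.c * (255 - srca * mask.c/255) / 255
 *
 * with the r5g6b5 destination expanded to 8888 and repacked around the
 * blend.  Blocks whose four mask pixels are all zero are skipped via the
 * pack_cmp test.
 */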
4796 static void
4797 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4798                                     pixman_op_t              op,
4799                                     pixman_image_t *         src_image,
4800                                     pixman_image_t *         mask_image,
4801                                     pixman_image_t *         dst_image,
4802                                     int32_t                  src_x,
4803                                     int32_t                  src_y,
4804                                     int32_t                  mask_x,
4805                                     int32_t                  mask_y,
4806                                     int32_t                  dest_x,
4807                                     int32_t                  dest_y,
4808                                     int32_t                  width,
4809                                     int32_t                  height)
4810 {
4811     uint32_t src;
4812     uint16_t    *dst_line, *dst, d;
4813     uint32_t    *mask_line, *mask, m;
4814     int dst_stride, mask_stride;
4815     int w;
4816     uint32_t pack_cmp;
4817
4818     __m128i xmm_src, xmm_alpha;
4819     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4820     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4821
4822     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4823
4824     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4825
4826     if (src == 0)
4827         return;
4828
4829     PIXMAN_IMAGE_GET_LINE (
4830         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4831     PIXMAN_IMAGE_GET_LINE (
4832         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4833
4834     xmm_src = expand_pixel_32_1x128 (src);
4835     xmm_alpha = expand_alpha_1x128 (xmm_src);
4836     mmx_src = _mm_movepi64_pi64 (xmm_src);
4837     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4838
4839     while (height--)
4840     {
4841         w = width;
4842         mask = mask_line;
4843         dst = dst_line;
4844         mask_line += mask_stride;
4845         dst_line += dst_stride;
4846
4847         /* call prefetch hint to optimize cache load*/
4848         cache_prefetch ((__m128i*)mask);
4849         cache_prefetch ((__m128i*)dst);
4850
4851         while (w && ((unsigned long)dst & 15))
4852         {
4853             m = *(uint32_t *) mask;
4854
4855             if (m)
4856             {
4857                 d = *dst;
4858                 mmx_mask = unpack_32_1x64 (m);
4859                 mmx_dest = expand565_16_1x64 (d);
4860
4861                 *dst = pack_565_32_16 (
4862                     pack_1x64_32 (
4863                         in_over_1x64 (
4864                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4865             }
4866
4867             w--;
4868             dst++;
4869             mask++;
4870         }
4871
4872         /* call prefetch hint to optimize cache load*/
4873         cache_prefetch ((__m128i*)mask);
4874         cache_prefetch ((__m128i*)dst);
4875
4876         while (w >= 8)
4877         {
4878             /* fill cache line with next memory */
4879             cache_prefetch_next ((__m128i*)mask);
4880             cache_prefetch_next ((__m128i*)dst);
4881
4882             /* First round */
4883             xmm_mask = load_128_unaligned ((__m128i*)mask);
4884             xmm_dst = load_128_aligned ((__m128i*)dst);
4885
4886             pack_cmp = _mm_movemask_epi8 (
4887                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4888
4889             unpack_565_128_4x128 (xmm_dst,
4890                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4891             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4892
4893             /* preload next round */
4894             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4895
4896             /* blend only if some of the four mask pixels are non-zero */
4897             if (pack_cmp != 0xffff)
4898             {
4899                 in_over_2x128 (&xmm_src, &xmm_src,
4900                                &xmm_alpha, &xmm_alpha,
4901                                &xmm_mask_lo, &xmm_mask_hi,
4902                                &xmm_dst0, &xmm_dst1);
4903             }
4904
4905             /* Second round */
4906             pack_cmp = _mm_movemask_epi8 (
4907                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4908
4909             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4910
4911             if (pack_cmp != 0xffff)
4912             {
4913                 in_over_2x128 (&xmm_src, &xmm_src,
4914                                &xmm_alpha, &xmm_alpha,
4915                                &xmm_mask_lo, &xmm_mask_hi,
4916                                &xmm_dst2, &xmm_dst3);
4917             }
4918
4919             save_128_aligned (
4920                 (__m128i*)dst, pack_565_4x128_128 (
4921                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4922
4923             w -= 8;
4924             dst += 8;
4925             mask += 8;
4926         }
4927
4928         while (w)
4929         {
4930             m = *(uint32_t *) mask;
4931
4932             if (m)
4933             {
4934                 d = *dst;
4935                 mmx_mask = unpack_32_1x64 (m);
4936                 mmx_dest = expand565_16_1x64 (d);
4937
4938                 *dst = pack_565_32_16 (
4939                     pack_1x64_32 (
4940                         in_over_1x64 (
4941                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4942             }
4943
4944             w--;
4945             dst++;
4946             mask++;
4947         }
4948     }
4949
4950     _mm_empty ();
4951 }
4952
4953 /* -----------------------------------------------------------------------
4954  * composite_in_n_8_8
4955  */
4956
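/* IN with a solid source, an a8 mask and an a8 destination.  Only alpha
 * values are involved; roughly, per destination byte,
 *
 *     dst = srca * m/255 * dst / 255
 *
 * computed 16 bytes per wide iteration.
 */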
4957 static void
4958 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4959                          pixman_op_t              op,
4960                          pixman_image_t *         src_image,
4961                          pixman_image_t *         mask_image,
4962                          pixman_image_t *         dst_image,
4963                          int32_t                  src_x,
4964                          int32_t                  src_y,
4965                          int32_t                  mask_x,
4966                          int32_t                  mask_y,
4967                          int32_t                  dest_x,
4968                          int32_t                  dest_y,
4969                          int32_t                  width,
4970                          int32_t                  height)
4971 {
4972     uint8_t     *dst_line, *dst;
4973     uint8_t     *mask_line, *mask;
4974     int dst_stride, mask_stride;
4975     uint32_t d, m;
4976     uint32_t src;
4977     uint8_t sa;
4978     int32_t w;
4979
4980     __m128i xmm_alpha;
4981     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4982     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4983
4984     PIXMAN_IMAGE_GET_LINE (
4985         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4986     PIXMAN_IMAGE_GET_LINE (
4987         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4988
4989     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4990
4991     sa = src >> 24;
4992
4993     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4994
4995     while (height--)
4996     {
4997         dst = dst_line;
4998         dst_line += dst_stride;
4999         mask = mask_line;
5000         mask_line += mask_stride;
5001         w = width;
5002
5003         /* Prefetch hints to warm the cache for the upcoming loads. */
5004         cache_prefetch ((__m128i*)mask);
5005         cache_prefetch ((__m128i*)dst);
5006
5007         while (w && ((unsigned long)dst & 15))
5008         {
5009             m = (uint32_t) *mask++;
5010             d = (uint32_t) *dst;
5011
5012             *dst++ = (uint8_t) pack_1x64_32 (
5013                 pix_multiply_1x64 (
5014                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
5015                                        unpack_32_1x64 (m)),
5016                     unpack_32_1x64 (d)));
5017             w--;
5018         }
5019
5020         /* Prefetch hints to warm the cache for the upcoming loads. */
5021         cache_prefetch ((__m128i*)mask);
5022         cache_prefetch ((__m128i*)dst);
5023
5024         while (w >= 16)
5025         {
5026             /* Prefetch the next cache line before it is needed. */
5027             cache_prefetch_next ((__m128i*)mask);
5028             cache_prefetch_next ((__m128i*)dst);
5029
5030             xmm_mask = load_128_unaligned ((__m128i*)mask);
5031             xmm_dst = load_128_aligned ((__m128i*)dst);
5032
5033             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5034             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5035
5036             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5037                                 &xmm_mask_lo, &xmm_mask_hi,
5038                                 &xmm_mask_lo, &xmm_mask_hi);
5039
5040             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
5041                                 &xmm_dst_lo, &xmm_dst_hi,
5042                                 &xmm_dst_lo, &xmm_dst_hi);
5043
5044             save_128_aligned (
5045                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5046
5047             mask += 16;
5048             dst += 16;
5049             w -= 16;
5050         }
5051
5052         while (w)
5053         {
5054             m = (uint32_t) *mask++;
5055             d = (uint32_t) *dst;
5056
5057             *dst++ = (uint8_t) pack_1x64_32 (
5058                 pix_multiply_1x64 (
5059                     pix_multiply_1x64 (
5060                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5061                     unpack_32_1x64 (d)));
5062             w--;
5063         }
5064     }
5065
5066     _mm_empty ();
5067 }
5068
5069 /* ---------------------------------------------------------------------------
5070  * composite_in_8_8
5071  */
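/* a8 source IN an a8 destination: dest = src * dest, with both values
 * treated as normalized alphas.
 */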
5072
5073 static void
5074 sse2_composite_in_8_8 (pixman_implementation_t *imp,
5075                        pixman_op_t              op,
5076                        pixman_image_t *         src_image,
5077                        pixman_image_t *         mask_image,
5078                        pixman_image_t *         dst_image,
5079                        int32_t                  src_x,
5080                        int32_t                  src_y,
5081                        int32_t                  mask_x,
5082                        int32_t                  mask_y,
5083                        int32_t                  dest_x,
5084                        int32_t                  dest_y,
5085                        int32_t                  width,
5086                        int32_t                  height)
5087 {
5088     uint8_t     *dst_line, *dst;
5089     uint8_t     *src_line, *src;
5090     int src_stride, dst_stride;
5091     int32_t w;
5092     uint32_t s, d;
5093
5094     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5095     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5096
5097     PIXMAN_IMAGE_GET_LINE (
5098         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5099     PIXMAN_IMAGE_GET_LINE (
5100         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5101
5102     while (height--)
5103     {
5104         dst = dst_line;
5105         dst_line += dst_stride;
5106         src = src_line;
5107         src_line += src_stride;
5108         w = width;
5109
5110         /* Prefetch hints to warm the cache for the upcoming loads. */
5111         cache_prefetch ((__m128i*)src);
5112         cache_prefetch ((__m128i*)dst);
5113
5114         while (w && ((unsigned long)dst & 15))
5115         {
5116             s = (uint32_t) *src++;
5117             d = (uint32_t) *dst;
5118
5119             *dst++ = (uint8_t) pack_1x64_32 (
5120                 pix_multiply_1x64 (
5121                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
5122             w--;
5123         }
5124
5125         /* Prefetch hints to warm the cache for the upcoming loads. */
5126         cache_prefetch ((__m128i*)src);
5127         cache_prefetch ((__m128i*)dst);
5128
5129         while (w >= 16)
5130         {
5131             /* Prefetch the next cache line before it is needed. */
5132             cache_prefetch_next ((__m128i*)src);
5133             cache_prefetch_next ((__m128i*)dst);
5134
5135             xmm_src = load_128_unaligned ((__m128i*)src);
5136             xmm_dst = load_128_aligned ((__m128i*)dst);
5137
5138             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5139             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5140
5141             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5142                                 &xmm_dst_lo, &xmm_dst_hi,
5143                                 &xmm_dst_lo, &xmm_dst_hi);
5144
5145             save_128_aligned (
5146                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5147
5148             src += 16;
5149             dst += 16;
5150             w -= 16;
5151         }
5152
5153         while (w)
5154         {
5155             s = (uint32_t) *src++;
5156             d = (uint32_t) *dst;
5157
5158             *dst++ = (uint8_t) pack_1x64_32 (
5159                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5160             w--;
5161         }
5162     }
5163
5164     _mm_empty ();
5165 }
5166
5167 /* -------------------------------------------------------------------------
5168  * composite_add_n_8_8
5169  */
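/* Solid source and a8 mask ADDed into an a8 destination:
 * dest = clamp (dest + srca * m), using saturating 16-bit adds.
 */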
5170
5171 static void
5172 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5173                           pixman_op_t              op,
5174                           pixman_image_t *         src_image,
5175                           pixman_image_t *         mask_image,
5176                           pixman_image_t *         dst_image,
5177                           int32_t                  src_x,
5178                           int32_t                  src_y,
5179                           int32_t                  mask_x,
5180                           int32_t                  mask_y,
5181                           int32_t                  dest_x,
5182                           int32_t                  dest_y,
5183                           int32_t                  width,
5184                           int32_t                  height)
5185 {
5186     uint8_t     *dst_line, *dst;
5187     uint8_t     *mask_line, *mask;
5188     int dst_stride, mask_stride;
5189     int32_t w;
5190     uint32_t src;
5191     uint8_t sa;
5192     uint32_t m, d;
5193
5194     __m128i xmm_alpha;
5195     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5196     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5197
5198     PIXMAN_IMAGE_GET_LINE (
5199         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5200     PIXMAN_IMAGE_GET_LINE (
5201         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5202
5203     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5204
5205     sa = src >> 24;
5206
5207     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5208
5209     while (height--)
5210     {
5211         dst = dst_line;
5212         dst_line += dst_stride;
5213         mask = mask_line;
5214         mask_line += mask_stride;
5215         w = width;
5216
5217         /* Prefetch hints to warm the cache for the upcoming loads. */
5218         cache_prefetch ((__m128i*)mask);
5219         cache_prefetch ((__m128i*)dst);
5220
5221         while (w && ((unsigned long)dst & 15))
5222         {
5223             m = (uint32_t) *mask++;
5224             d = (uint32_t) *dst;
5225
5226             *dst++ = (uint8_t) pack_1x64_32 (
5227                 _mm_adds_pu16 (
5228                     pix_multiply_1x64 (
5229                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5230                     unpack_32_1x64 (d)));
5231             w--;
5232         }
5233
5234         /* Prefetch hints to warm the cache for the upcoming loads. */
5235         cache_prefetch ((__m128i*)mask);
5236         cache_prefetch ((__m128i*)dst);
5237
5238         while (w >= 16)
5239         {
5240             /* Prefetch the next cache line before it is needed. */
5241             cache_prefetch_next ((__m128i*)mask);
5242             cache_prefetch_next ((__m128i*)dst);
5243
5244             xmm_mask = load_128_unaligned ((__m128i*)mask);
5245             xmm_dst = load_128_aligned ((__m128i*)dst);
5246
5247             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5248             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5249
5250             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5251                                 &xmm_mask_lo, &xmm_mask_hi,
5252                                 &xmm_mask_lo, &xmm_mask_hi);
5253
5254             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5255             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5256
5257             save_128_aligned (
5258                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5259
5260             mask += 16;
5261             dst += 16;
5262             w -= 16;
5263         }
5264
5265         while (w)
5266         {
5267             m = (uint32_t) *mask++;
5268             d = (uint32_t) *dst;
5269
5270             *dst++ = (uint8_t) pack_1x64_32 (
5271                 _mm_adds_pu16 (
5272                     pix_multiply_1x64 (
5273                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5274                     unpack_32_1x64 (d)));
5275
5276             w--;
5277         }
5278     }
5279
5280     _mm_empty ();
5281 }
5282
5283 /* ----------------------------------------------------------------------
5284  * composite_add_8000_8000
5285  */
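/* a8-to-a8 ADD: every destination byte becomes clamp (dest + src).
 * Unaligned head and tail bytes use the scalar clamp trick below; the
 * aligned middle of each row is handed to core_combine_add_u_sse2 as
 * 32-bit words.
 */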
5286
5287 static void
5288 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5289                               pixman_op_t              op,
5290                               pixman_image_t *         src_image,
5291                               pixman_image_t *         mask_image,
5292                               pixman_image_t *         dst_image,
5293                               int32_t                  src_x,
5294                               int32_t                  src_y,
5295                               int32_t                  mask_x,
5296                               int32_t                  mask_y,
5297                               int32_t                  dest_x,
5298                               int32_t                  dest_y,
5299                               int32_t                  width,
5300                               int32_t                  height)
5301 {
5302     uint8_t     *dst_line, *dst;
5303     uint8_t     *src_line, *src;
5304     int dst_stride, src_stride;
5305     int32_t w;
5306     uint16_t t;
5307
5308     PIXMAN_IMAGE_GET_LINE (
5309         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5310     PIXMAN_IMAGE_GET_LINE (
5311         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5312
5313     while (height--)
5314     {
5315         dst = dst_line;
5316         src = src_line;
5317
5318         /* Prefetch hints to warm the cache for the upcoming loads. */
5319         cache_prefetch ((__m128i*)src);
5320         cache_prefetch ((__m128i*)dst);
5321
5322         dst_line += dst_stride;
5323         src_line += src_stride;
5324         w = width;
5325
5326         /* Small head */
5327         while (w && (unsigned long)dst & 3)
5328         {
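            /* Branch-free saturating byte add: if the 16-bit sum t
             * exceeds 255, (t >> 8) is 1 and (0 - 1) is all ones, so the
             * OR forces the stored byte to 0xff; otherwise t is stored
             * unchanged.  E.g. 200 + 100 = 0x12c clamps to 0xff.
             */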
5329             t = (*dst) + (*src++);
5330             *dst++ = t | (0 - (t >> 8));
5331             w--;
5332         }
5333
5334         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5335
5336         /* Small tail */
5337         dst += w & 0xfffc;
5338         src += w & 0xfffc;
5339
5340         w &= 3;
5341
5342         while (w)
5343         {
5344             t = (*dst) + (*src++);
5345             *dst++ = t | (0 - (t >> 8));
5346             w--;
5347         }
5348     }
5349
5350     _mm_empty ();
5351 }
5352
5353 /* ---------------------------------------------------------------------
5354  * composite_add_8888_8888
5355  */
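/* a8r8g8b8-to-a8r8g8b8 ADD: each scanline is handed to
 * core_combine_add_u_sse2, which performs a component-wise saturating add.
 */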
5356 static void
5357 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5358                               pixman_op_t              op,
5359                               pixman_image_t *         src_image,
5360                               pixman_image_t *         mask_image,
5361                               pixman_image_t *         dst_image,
5362                               int32_t                  src_x,
5363                               int32_t                  src_y,
5364                               int32_t                  mask_x,
5365                               int32_t                  mask_y,
5366                               int32_t                  dest_x,
5367                               int32_t                  dest_y,
5368                               int32_t                  width,
5369                               int32_t                  height)
5370 {
5371     uint32_t    *dst_line, *dst;
5372     uint32_t    *src_line, *src;
5373     int dst_stride, src_stride;
5374
5375     PIXMAN_IMAGE_GET_LINE (
5376         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5377     PIXMAN_IMAGE_GET_LINE (
5378         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5379
5380     while (height--)
5381     {
5382         dst = dst_line;
5383         dst_line += dst_stride;
5384         src = src_line;
5385         src_line += src_stride;
5386
5387         core_combine_add_u_sse2 (dst, src, NULL, width);
5388     }
5389
5390     _mm_empty ();
5391 }
5392
5393 /* -------------------------------------------------------------------------------------------------
5394  * sse2_composite_copy_area
5395  */
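/* pixman_blt_sse2 () is a plain copy with no blending.  It only handles
 * equal source and destination depths of 16 or 32 bits per pixel and
 * returns FALSE otherwise so the caller can fall back.  Each row is
 * copied with small scalar head/tail loops around a 64-byte-per-iteration
 * SSE2 loop that stores to 16-byte-aligned destinations.
 */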
5396
5397 static pixman_bool_t
5398 pixman_blt_sse2 (uint32_t *src_bits,
5399                  uint32_t *dst_bits,
5400                  int       src_stride,
5401                  int       dst_stride,
5402                  int       src_bpp,
5403                  int       dst_bpp,
5404                  int       src_x,
5405                  int       src_y,
5406                  int       dst_x,
5407                  int       dst_y,
5408                  int       width,
5409                  int       height)
5410 {
5411     uint8_t *   src_bytes;
5412     uint8_t *   dst_bytes;
5413     int byte_width;
5414
5415     if (src_bpp != dst_bpp)
5416         return FALSE;
5417
5418     if (src_bpp == 16)
5419     {
5420         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5421         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5422         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5423         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5424         byte_width = 2 * width;
5425         src_stride *= 2;
5426         dst_stride *= 2;
5427     }
5428     else if (src_bpp == 32)
5429     {
5430         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5431         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5432         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5433         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5434         byte_width = 4 * width;
5435         src_stride *= 4;
5436         dst_stride *= 4;
5437     }
5438     else
5439     {
5440         return FALSE;
5441     }
5442
5443     cache_prefetch ((__m128i*)src_bytes);
5444     cache_prefetch ((__m128i*)dst_bytes);
5445
5446     while (height--)
5447     {
5448         int w;
5449         uint8_t *s = src_bytes;
5450         uint8_t *d = dst_bytes;
5451         src_bytes += src_stride;
5452         dst_bytes += dst_stride;
5453         w = byte_width;
5454
5455         cache_prefetch_next ((__m128i*)s);
5456         cache_prefetch_next ((__m128i*)d);
5457
5458         while (w >= 2 && ((unsigned long)d & 3))
5459         {
5460             *(uint16_t *)d = *(uint16_t *)s;
5461             w -= 2;
5462             s += 2;
5463             d += 2;
5464         }
5465
5466         while (w >= 4 && ((unsigned long)d & 15))
5467         {
5468             *(uint32_t *)d = *(uint32_t *)s;
5469
5470             w -= 4;
5471             s += 4;
5472             d += 4;
5473         }
5474
5475         cache_prefetch_next ((__m128i*)s);
5476         cache_prefetch_next ((__m128i*)d);
5477
5478         while (w >= 64)
5479         {
5480             __m128i xmm0, xmm1, xmm2, xmm3;
5481
5482             /* 128 bytes ahead */
5483             cache_prefetch (((__m128i*)s) + 8);
5484             cache_prefetch (((__m128i*)d) + 8);
5485
5486             xmm0 = load_128_unaligned ((__m128i*)(s));
5487             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5488             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5489             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5490
5491             save_128_aligned ((__m128i*)(d),    xmm0);
5492             save_128_aligned ((__m128i*)(d + 16), xmm1);
5493             save_128_aligned ((__m128i*)(d + 32), xmm2);
5494             save_128_aligned ((__m128i*)(d + 48), xmm3);
5495
5496             s += 64;
5497             d += 64;
5498             w -= 64;
5499         }
5500
5501         cache_prefetch_next ((__m128i*)s);
5502         cache_prefetch_next ((__m128i*)d);
5503
5504         while (w >= 16)
5505         {
5506             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5507
5508             w -= 16;
5509             d += 16;
5510             s += 16;
5511         }
5512
5513         cache_prefetch_next ((__m128i*)s);
5514         cache_prefetch_next ((__m128i*)d);
5515
5516         while (w >= 4)
5517         {
5518             *(uint32_t *)d = *(uint32_t *)s;
5519
5520             w -= 4;
5521             s += 4;
5522             d += 4;
5523         }
5524
5525         if (w >= 2)
5526         {
5527             *(uint16_t *)d = *(uint16_t *)s;
5528             w -= 2;
5529             s += 2;
5530             d += 2;
5531         }
5532     }
5533
5534     _mm_empty ();
5535
5536     return TRUE;
5537 }
5538
5539 static void
5540 sse2_composite_copy_area (pixman_implementation_t *imp,
5541                           pixman_op_t              op,
5542                           pixman_image_t *         src_image,
5543                           pixman_image_t *         mask_image,
5544                           pixman_image_t *         dst_image,
5545                           int32_t                  src_x,
5546                           int32_t                  src_y,
5547                           int32_t                  mask_x,
5548                           int32_t                  mask_y,
5549                           int32_t                  dest_x,
5550                           int32_t                  dest_y,
5551                           int32_t                  width,
5552                           int32_t                  height)
5553 {
5554     pixman_blt_sse2 (src_image->bits.bits,
5555                      dst_image->bits.bits,
5556                      src_image->bits.rowstride,
5557                      dst_image->bits.rowstride,
5558                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5559                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5560                      src_x, src_y, dest_x, dest_y, width, height);
5561 }
5562
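/* OVER with an x888 source (treated as opaque by forcing the alpha byte
 * to 0xff), an a8 mask and an 8888 destination.  Groups of four pixels
 * whose mask bytes are all 0xff are stored directly.
 */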
5563 static void
5564 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5565                                  pixman_op_t              op,
5566                                  pixman_image_t *         src_image,
5567                                  pixman_image_t *         mask_image,
5568                                  pixman_image_t *         dst_image,
5569                                  int32_t                  src_x,
5570                                  int32_t                  src_y,
5571                                  int32_t                  mask_x,
5572                                  int32_t                  mask_y,
5573                                  int32_t                  dest_x,
5574                                  int32_t                  dest_y,
5575                                  int32_t                  width,
5576                                  int32_t                  height)
5577 {
5578     uint32_t    *src, *src_line, s;
5579     uint32_t    *dst, *dst_line, d;
5580     uint8_t         *mask, *mask_line;
5581     uint32_t m;
5582     int src_stride, mask_stride, dst_stride;
5583     int32_t w;
5584     __m64 ms;
5585
5586     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5587     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5588     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5589
5590     PIXMAN_IMAGE_GET_LINE (
5591         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5592     PIXMAN_IMAGE_GET_LINE (
5593         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5594     PIXMAN_IMAGE_GET_LINE (
5595         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5596
5597     while (height--)
5598     {
5599         src = src_line;
5600         src_line += src_stride;
5601         dst = dst_line;
5602         dst_line += dst_stride;
5603         mask = mask_line;
5604         mask_line += mask_stride;
5605
5606         w = width;
5607
5608         /* Prefetch hints to warm the cache for the upcoming loads. */
5609         cache_prefetch ((__m128i*)src);
5610         cache_prefetch ((__m128i*)dst);
5611         cache_prefetch ((__m128i*)mask);
5612
5613         while (w && (unsigned long)dst & 15)
5614         {
5615             s = 0xff000000 | *src++;
5616             m = (uint32_t) *mask++;
5617             d = *dst;
5618             ms = unpack_32_1x64 (s);
5619
5620             if (m != 0xff)
5621             {
5622                 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5623                 __m64 md = unpack_32_1x64 (d);
5624
5625                 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5626             }
5627
5628             *dst++ = pack_1x64_32 (ms);
5629             w--;
5630         }
5631
5632         /* Prefetch hints to warm the cache for the upcoming loads. */
5633         cache_prefetch ((__m128i*)src);
5634         cache_prefetch ((__m128i*)dst);
5635         cache_prefetch ((__m128i*)mask);
5636
5637         while (w >= 4)
5638         {
5639             /* Prefetch the next cache line before it is needed. */
5640             cache_prefetch_next ((__m128i*)src);
5641             cache_prefetch_next ((__m128i*)dst);
5642             cache_prefetch_next ((__m128i*)mask);
5643
5644             m = *(uint32_t*) mask;
5645             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5646
5647             if (m == 0xffffffff)
5648             {
5649                 save_128_aligned ((__m128i*)dst, xmm_src);
5650             }
5651             else
5652             {
5653                 xmm_dst = load_128_aligned ((__m128i*)dst);
5654
5655                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128 ());
5656
5657                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5658                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5659                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5660
5661                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5662
5663                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5664
5665                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5666             }
5667
5668             src += 4;
5669             dst += 4;
5670             mask += 4;
5671             w -= 4;
5672         }
5673
5674         while (w)
5675         {
5676             m = (uint32_t) *mask++;
5677
5678             if (m)
5679             {
5680                 s = 0xff000000 | *src;
5681
5682                 if (m == 0xff)
5683                 {
5684                     *dst = s;
5685                 }
5686                 else
5687                 {
5688                     __m64 ma, md, ms;
5689
5690                     d = *dst;
5691
5692                     ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5693                     md = unpack_32_1x64 (d);
5694                     ms = unpack_32_1x64 (s);
5695
5696                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5697                 }
5698
5699             }
5700
5701             src++;
5702             dst++;
5703             w--;
5704         }
5705     }
5706
5707     _mm_empty ();
5708 }
5709
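/* OVER with an 8888 source, an a8 mask and an 8888 destination.  Groups
 * of four pixels are skipped when the mask is zero and stored directly
 * when both the mask and the source are fully opaque.
 */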
5710 static void
5711 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5712                                  pixman_op_t              op,
5713                                  pixman_image_t *         src_image,
5714                                  pixman_image_t *         mask_image,
5715                                  pixman_image_t *         dst_image,
5716                                  int32_t                  src_x,
5717                                  int32_t                  src_y,
5718                                  int32_t                  mask_x,
5719                                  int32_t                  mask_y,
5720                                  int32_t                  dest_x,
5721                                  int32_t                  dest_y,
5722                                  int32_t                  width,
5723                                  int32_t                  height)
5724 {
5725     uint32_t    *src, *src_line, s;
5726     uint32_t    *dst, *dst_line, d;
5727     uint8_t         *mask, *mask_line;
5728     uint32_t m;
5729     int src_stride, mask_stride, dst_stride;
5730     int32_t w;
5731
5732     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5733     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5734     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5735
5736     PIXMAN_IMAGE_GET_LINE (
5737         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5738     PIXMAN_IMAGE_GET_LINE (
5739         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5740     PIXMAN_IMAGE_GET_LINE (
5741         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5742
5743     while (height--)
5744     {
5745         src = src_line;
5746         src_line += src_stride;
5747         dst = dst_line;
5748         dst_line += dst_stride;
5749         mask = mask_line;
5750         mask_line += mask_stride;
5751
5752         w = width;
5753
5754         /* Prefetch hints to warm the cache for the upcoming loads. */
5755         cache_prefetch ((__m128i *)src);
5756         cache_prefetch ((__m128i *)dst);
5757         cache_prefetch ((__m128i *)mask);
5758
5759         while (w && (unsigned long)dst & 15)
5760         {
5761             uint32_t sa;
5762
5763             s = *src++;
5764             m = (uint32_t) *mask++;
5765             d = *dst;
5766
5767             sa = s >> 24;
5768
5769             if (m)
5770             {
5771                 if (sa == 0xff && m == 0xff)
5772                 {
5773                     *dst = s;
5774                 }
5775                 else
5776                 {
5777                     __m64 ms, md, ma, msa;
5778
5779                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5780                     ms = unpack_32_1x64 (s);
5781                     md = unpack_32_1x64 (d);
5782
5783                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5784
5785                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5786                 }
5787             }
5788
5789             dst++;
5790             w--;
5791         }
5792
5793         /* Prefetch hints to warm the cache for the upcoming loads. */
5794         cache_prefetch ((__m128i *)src);
5795         cache_prefetch ((__m128i *)dst);
5796         cache_prefetch ((__m128i *)mask);
5797
5798         while (w >= 4)
5799         {
5800             /* Prefetch the next cache line before it is needed. */
5801             cache_prefetch_next ((__m128i *)src);
5802             cache_prefetch_next ((__m128i *)dst);
5803             cache_prefetch_next ((__m128i *)mask);
5804
5805             m = *(uint32_t *) mask;
5806
5807             if (m)
5808             {
5809                 xmm_src = load_128_unaligned ((__m128i*)src);
5810
5811                 if (m == 0xffffffff && is_opaque (xmm_src))
5812                 {
5813                     save_128_aligned ((__m128i *)dst, xmm_src);
5814                 }
5815                 else
5816                 {
5817                     xmm_dst = load_128_aligned ((__m128i *)dst);
5818
5819                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128 ());
5820
5821                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5822                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5823                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5824
5825                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5826                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5827
5828                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5829                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5830
5831                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5832                 }
5833             }
5834
5835             src += 4;
5836             dst += 4;
5837             mask += 4;
5838             w -= 4;
5839         }
5840
5841         while (w)
5842         {
5843             uint32_t sa;
5844
5845             s = *src++;
5846             m = (uint32_t) *mask++;
5847             d = *dst;
5848
5849             sa = s >> 24;
5850
5851             if (m)
5852             {
5853                 if (sa == 0xff && m == 0xff)
5854                 {
5855                     *dst = s;
5856                 }
5857                 else
5858                 {
5859                     __m64 ms, md, ma, msa;
5860
5861                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5862                     ms = unpack_32_1x64 (s);
5863                     md = unpack_32_1x64 (d);
5864
5865                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5866
5867                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5868                 }
5869             }
5870
5871             dst++;
5872             w--;
5873         }
5874     }
5875
5876     _mm_empty ();
5877 }
5878
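/* OVER_REVERSE with a solid source: the destination stays on top and the
 * source only shows through where the destination is not opaque,
 * i.e. dest = dest + (1 - dest.alpha) * src.
 */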
5879 static void
5880 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5881                                     pixman_op_t              op,
5882                                     pixman_image_t *         src_image,
5883                                     pixman_image_t *         mask_image,
5884                                     pixman_image_t *         dst_image,
5885                                     int32_t                  src_x,
5886                                     int32_t                  src_y,
5887                                     int32_t                  mask_x,
5888                                     int32_t                  mask_y,
5889                                     int32_t                  dest_x,
5890                                     int32_t                  dest_y,
5891                                     int32_t                  width,
5892                                     int32_t                  height)
5893 {
5894     uint32_t src;
5895     uint32_t    *dst_line, *dst;
5896     __m128i xmm_src;
5897     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5898     __m128i xmm_dsta_hi, xmm_dsta_lo;
5899     int dst_stride;
5900     int32_t w;
5901
5902     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5903
5904     if (src == 0)
5905         return;
5906
5907     PIXMAN_IMAGE_GET_LINE (
5908         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5909
5910     xmm_src = expand_pixel_32_1x128 (src);
5911
5912     while (height--)
5913     {
5914         dst = dst_line;
5915
5916         /* Prefetch hints to warm the cache for the upcoming loads. */
5917         cache_prefetch ((__m128i*)dst);
5918
5919         dst_line += dst_stride;
5920         w = width;
5921
5922         while (w && (unsigned long)dst & 15)
5923         {
5924             __m64 vd;
5925
5926             vd = unpack_32_1x64 (*dst);
5927
5928             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5929                                             _mm_movepi64_pi64 (xmm_src)));
5930             w--;
5931             dst++;
5932         }
5933
5934         cache_prefetch ((__m128i*)dst);
5935
5936         while (w >= 4)
5937         {
5938             __m128i tmp_lo, tmp_hi;
5939
5940             /* Prefetch the next cache line before it is needed. */
5941             cache_prefetch_next ((__m128i*)(dst + 4));
5942
5943             xmm_dst = load_128_aligned ((__m128i*)dst);
5944
5945             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5946             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5947
5948             tmp_lo = xmm_src;
5949             tmp_hi = xmm_src;
5950
5951             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5952                         &xmm_dsta_lo, &xmm_dsta_hi,
5953                         &tmp_lo, &tmp_hi);
5954
5955             save_128_aligned (
5956                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5957
5958             w -= 4;
5959             dst += 4;
5960         }
5961
5962         while (w)
5963         {
5964             __m64 vd;
5965
5966             vd = unpack_32_1x64 (*dst);
5967
5968             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5969                                             _mm_movepi64_pi64 (xmm_src)));
5970             w--;
5971             dst++;
5972         }
5973
5974     }
5975
5976     _mm_empty ();
5977 }
5978
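/* OVER with an 8888 source and an 8888 mask (only the mask's alpha
 * channel is used) onto an 8888 destination.  Groups of four pixels with
 * a fully transparent mask are skipped; fully opaque mask and source
 * groups are copied straight through.
 */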
5979 static void
5980 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5981                                     pixman_op_t              op,
5982                                     pixman_image_t *         src_image,
5983                                     pixman_image_t *         mask_image,
5984                                     pixman_image_t *         dst_image,
5985                                     int32_t                  src_x,
5986                                     int32_t                  src_y,
5987                                     int32_t                  mask_x,
5988                                     int32_t                  mask_y,
5989                                     int32_t                  dest_x,
5990                                     int32_t                  dest_y,
5991                                     int32_t                  width,
5992                                     int32_t                  height)
5993 {
5994     uint32_t    *src, *src_line, s;
5995     uint32_t    *dst, *dst_line, d;
5996     uint32_t    *mask, *mask_line;
5997     uint32_t    m;
5998     int src_stride, mask_stride, dst_stride;
5999     int32_t w;
6000
6001     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
6002     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6003     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
6004
6005     PIXMAN_IMAGE_GET_LINE (
6006         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
6007     PIXMAN_IMAGE_GET_LINE (
6008         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
6009     PIXMAN_IMAGE_GET_LINE (
6010         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
6011
6012     while (height--)
6013     {
6014         src = src_line;
6015         src_line += src_stride;
6016         dst = dst_line;
6017         dst_line += dst_stride;
6018         mask = mask_line;
6019         mask_line += mask_stride;
6020
6021         w = width;
6022
6023         /* Prefetch hints to warm the cache for the upcoming loads. */
6024         cache_prefetch ((__m128i *)src);
6025         cache_prefetch ((__m128i *)dst);
6026         cache_prefetch ((__m128i *)mask);
6027
6028         while (w && (unsigned long)dst & 15)
6029         {
6030             uint32_t sa;
6031
6032             s = *src++;
6033             m = (*mask++) >> 24;
6034             d = *dst;
6035
6036             sa = s >> 24;
6037
6038             if (m)
6039             {
6040                 if (sa == 0xff && m == 0xff)
6041                 {
6042                     *dst = s;
6043                 }
6044                 else
6045                 {
6046                     __m64 ms, md, ma, msa;
6047
6048                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
6049                     ms = unpack_32_1x64 (s);
6050                     md = unpack_32_1x64 (d);
6051
6052                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
6053
6054                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
6055                 }
6056             }
6057
6058             dst++;
6059             w--;
6060         }
6061
6062         /* Prefetch hints to warm the cache for the upcoming loads. */
6063         cache_prefetch ((__m128i *)src);
6064         cache_prefetch ((__m128i *)dst);
6065         cache_prefetch ((__m128i *)mask);
6066
6067         while (w >= 4)
6068         {
6069             /* Prefetch the next cache line before it is needed. */
6070             cache_prefetch_next ((__m128i *)src);
6071             cache_prefetch_next ((__m128i *)dst);
6072             cache_prefetch_next ((__m128i *)mask);
6073
6074             xmm_mask = load_128_unaligned ((__m128i*)mask);
6075
6076             if (!is_transparent (xmm_mask))
6077             {
6078                 xmm_src = load_128_unaligned ((__m128i*)src);
6079
6080                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
6081                 {
6082                     save_128_aligned ((__m128i *)dst, xmm_src);
6083                 }
6084                 else
6085                 {
6086                     xmm_dst = load_128_aligned ((__m128i *)dst);
6087
6088                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6089                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
6090                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6091
6092                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
6093                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
6094
6095                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
6096                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
6097
6098                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6099                 }
6100             }
6101
6102             src += 4;
6103             dst += 4;
6104             mask += 4;
6105             w -= 4;
6106         }
6107
6108         while (w)
6109         {
6110             uint32_t sa;
6111
6112             s = *src++;
6113             m = (*mask++) >> 24;
6114             d = *dst;
6115
6116             sa = s >> 24;
6117
6118             if (m)
6119             {
6120                 if (sa == 0xff && m == 0xff)
6121                 {
6122                     *dst = s;
6123                 }
6124                 else
6125                 {
6126                     __m64 ms, md, ma, msa;
6127
6128                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
6129                     ms = unpack_32_1x64 (s);
6130                     md = unpack_32_1x64 (d);
6131
6132                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
6133
6134                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
6135                 }
6136             }
6137
6138             dst++;
6139             w--;
6140         }
6141     }
6142
6143     _mm_empty ();
6144 }
6145
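/* Table of (operator, source, mask, destination) format combinations
 * accelerated by this implementation.  Composites that match no entry
 * fall through to the delegate implementation set up in
 * _pixman_implementation_create_sse2 () below.
 */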
6146 static const pixman_fast_path_t sse2_fast_paths[] =
6147 {
6148     /* PIXMAN_OP_OVER */
6149     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6150     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6151     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6152     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6153     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6154     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6155     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6156     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6157     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6158     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6159     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6160     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6161     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6162     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6163     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6164     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6165     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6166     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6167     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6168     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6169     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6170     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6171     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6172     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6173     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6174     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6175     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6176     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6177     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6178     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6179     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6180     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6181     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6182     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6183     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6184     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6185     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6186     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6187     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6188     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6189     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6190     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6191     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6192     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6193     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6194     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6195
6196     /* PIXMAN_OP_OVER_REVERSE */
6197     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6198     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6199
6200     /* PIXMAN_OP_ADD */
6201     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6202     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
6203     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6204     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6205     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6206
6207     /* PIXMAN_OP_SRC */
6208     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6209     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6210     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6211     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6212     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6213     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6214     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6215     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6216     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6217     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6218     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6219     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6220     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6221     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6222
6223     /* PIXMAN_OP_IN */
6224     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6225     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6226
6227     { PIXMAN_OP_NONE },
6228 };
6229
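/* Try the SSE2 blitter first; if it cannot handle the request, defer to
 * the delegate implementation.
 */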
6230 static pixman_bool_t
6231 sse2_blt (pixman_implementation_t *imp,
6232           uint32_t *               src_bits,
6233           uint32_t *               dst_bits,
6234           int                      src_stride,
6235           int                      dst_stride,
6236           int                      src_bpp,
6237           int                      dst_bpp,
6238           int                      src_x,
6239           int                      src_y,
6240           int                      dst_x,
6241           int                      dst_y,
6242           int                      width,
6243           int                      height)
6244 {
6245     if (!pixman_blt_sse2 (
6246             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6247             src_x, src_y, dst_x, dst_y, width, height))
6249     {
6250         return _pixman_implementation_blt (
6251             imp->delegate,
6252             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6253             src_x, src_y, dst_x, dst_y, width, height);
6254     }
6255
6256     return TRUE;
6257 }
6258
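/* The 32-bit x86 ABI only guarantees 4-byte stack alignment, while the
 * SSE2 code may keep __m128i values in 16-byte-aligned stack slots, so
 * GCC is asked to re-align the stack on entry to these functions.
 */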
6259 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6260 __attribute__((__force_align_arg_pointer__))
6261 #endif
6262 static pixman_bool_t
6263 sse2_fill (pixman_implementation_t *imp,
6264            uint32_t *               bits,
6265            int                      stride,
6266            int                      bpp,
6267            int                      x,
6268            int                      y,
6269            int                      width,
6270            int                      height,
6271            uint32_t xor)
6272 {
6273     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
6274     {
6275         return _pixman_implementation_fill (
6276             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
6277     }
6278
6279     return TRUE;
6280 }
6281
6282 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6283 __attribute__((__force_align_arg_pointer__))
6284 #endif
6285 pixman_implementation_t *
6286 _pixman_implementation_create_sse2 (void)
6287 {
6288 #ifdef USE_MMX
6289     pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
6290 #else
6291     pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
6292 #endif
6293     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6294
6295     /* SSE2 constants */
6296     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6297     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6298     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6299     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6300     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6301     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6302     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6303     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6304     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
6305     mask_0080 = create_mask_16_128 (0x0080);
6306     mask_00ff = create_mask_16_128 (0x00ff);
6307     mask_0101 = create_mask_16_128 (0x0101);
6308     mask_ffff = create_mask_16_128 (0xffff);
6309     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6310     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6311
6312     /* MMX constants */
6313     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
6314     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
6315
6316     mask_x0080 = create_mask_16_64 (0x0080);
6317     mask_x00ff = create_mask_16_64 (0x00ff);
6318     mask_x0101 = create_mask_16_64 (0x0101);
6319     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
6320
6321     _mm_empty ();
6322
6323     /* Set up function pointers */
6324
6325     /* SSE code patch for fbcompose.c */
6326     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6327     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6328     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6329     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6330     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6331     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6332     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6333     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6334     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6335     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6336
6337     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6338
6339     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6340     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6341     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6342     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6343     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6344     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6345     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6346     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6347     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6348     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6349     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6350
6351     imp->blt = sse2_blt;
6352     imp->fill = sse2_fill;
6353
6354     return imp;
6355 }
6356
6357 #endif /* USE_SSE2 */