Add support for 8bpp to pixman_fill_sse2()
[profile/ivi/pixman.git] / pixman / pixman-sse2.c
1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <mmintrin.h>
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41  * the pixman-x64-mmx-emulation.h file contains
42  * implementations of those MMX intrinsics that
43  * are used in the SSE2 implementation.
44  */
45 #   include "pixman-x64-mmx-emulation.h"
46 #endif
47
48 #ifdef USE_SSE2
49
50 /* --------------------------------------------------------------------
51  * Locals
52  */
53
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
58
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
61
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
68
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
75
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
78
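/* These mask constants are initialized at run time, when the SSE2
 * implementation is created, before any of the routines below are
 * called.
 */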
79 /* ----------------------------------------------------------------------
80  * SSE2 Inlines
81  */
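/* unpack_32_1x128 widens one packed a8r8g8b8 pixel to four 16-bit
 * channels in the low half of an XMM register; unpack_128_2x128 does
 * the same for four packed pixels, splitting them into a "lo" register
 * (pixels 0-1) and a "hi" register (pixels 2-3).
 */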
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
84 {
85     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
86 }
87
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
90 {
91     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
93 }
94
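/* unpack_565_to_8888 converts four r5g6b5 values (one per 32-bit lane)
 * to x8r8g8b8: each field is shifted into its 8-bit slot and its high
 * bits are replicated into the newly created low bits, so a full-scale
 * 565 component expands to 0xff rather than 0xf8 or 0xfc.
 */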
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
97 {
98     __m128i r, g, b, rb, t;
99
100     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
103
104     rb = _mm_or_si128 (r, b);
105     t  = _mm_and_si128 (rb, mask_565_fix_rb);
106     t  = _mm_srli_epi32 (t, 5);
107     rb = _mm_or_si128 (rb, t);
108
109     t  = _mm_and_si128 (g, mask_565_fix_g);
110     t  = _mm_srli_epi32 (t, 6);
111     g  = _mm_or_si128 (g, t);
112
113     return _mm_or_si128 (rb, g);
114 }
115
116 static force_inline void
117 unpack_565_128_4x128 (__m128i  data,
118                       __m128i* data0,
119                       __m128i* data1,
120                       __m128i* data2,
121                       __m128i* data3)
122 {
123     __m128i lo, hi;
124
125     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
127
128     lo = unpack_565_to_8888 (lo);
129     hi = unpack_565_to_8888 (hi);
130
131     unpack_128_2x128 (lo, data0, data1);
132     unpack_128_2x128 (hi, data2, data3);
133 }
134
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
137 {
138     return (uint16_t) (((pixel >> 8) & 0xf800) |
139                        ((pixel >> 5) & 0x07e0) |
140                        ((pixel >> 3) & 0x001f));
141 }
142
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
145 {
146     return _mm_packus_epi16 (lo, hi);
147 }
148
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
151 {
152     __m128i data;
153     __m128i r, g1, g2, b;
154
155     data = pack_2x128_128 (lo, hi);
156
157     r  = _mm_and_si128 (data, mask_565_r);
158     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
162     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163 }
164
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167 {
168     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169                              pack_565_2x128_128 (*xmm2, *xmm3));
170 }
171
172 static force_inline int
173 is_opaque (__m128i x)
174 {
175     __m128i ffs = _mm_cmpeq_epi8 (x, x);
176
177     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178 }
179
180 static force_inline int
181 is_zero (__m128i x)
182 {
183     return _mm_movemask_epi8 (
184         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185 }
186
187 static force_inline int
188 is_transparent (__m128i x)
189 {
190     return (_mm_movemask_epi8 (
191                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192 }
193
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
196 {
197     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198 }
199
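/* expand_alpha_* replicate the 16-bit alpha of each unpacked pixel
 * into all four channel slots (00AA 00AA 00AA 00AA); the _rev variants
 * do the same for data whose alpha (or mask) value sits in the low
 * word instead.
 */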
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
202 {
203     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204                                                      _MM_SHUFFLE (3, 3, 3, 3)),
205                                 _MM_SHUFFLE (3, 3, 3, 3));
206 }
207
208 static force_inline void
209 expand_alpha_2x128 (__m128i  data_lo,
210                     __m128i  data_hi,
211                     __m128i* alpha_lo,
212                     __m128i* alpha_hi)
213 {
214     __m128i lo, hi;
215
216     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
219     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221 }
222
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i  data_lo,
225                         __m128i  data_hi,
226                         __m128i* alpha_lo,
227                         __m128i* alpha_hi)
228 {
229     __m128i lo, hi;
230
231     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235 }
236
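/* pix_multiply_* compute per-channel x * a / 255 with rounding.  Each
 * 16-bit lane holds t = x * a; adding 0x80 and keeping the high word
 * of the multiplication by 0x0101 evaluates (t + 128) * 257 >> 16,
 * which is exactly t / 255 rounded to nearest for 8-bit inputs.
 */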
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
239                     __m128i* data_hi,
240                     __m128i* alpha_lo,
241                     __m128i* alpha_hi,
242                     __m128i* ret_lo,
243                     __m128i* ret_hi)
244 {
245     __m128i lo, hi;
246
247     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249     lo = _mm_adds_epu16 (lo, mask_0080);
250     hi = _mm_adds_epu16 (hi, mask_0080);
251     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253 }
254
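/* pix_add_multiply_* evaluate
 *
 *     src * alpha_dst / 255 + dst * alpha_src / 255
 *
 * with a saturating add; this is the building block of the ATOP and
 * XOR combiners further down.
 */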
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
257                         __m128i* src_hi,
258                         __m128i* alpha_dst_lo,
259                         __m128i* alpha_dst_hi,
260                         __m128i* dst_lo,
261                         __m128i* dst_hi,
262                         __m128i* alpha_src_lo,
263                         __m128i* alpha_src_hi,
264                         __m128i* ret_lo,
265                         __m128i* ret_hi)
266 {
267     __m128i t1_lo, t1_hi;
268     __m128i t2_lo, t2_hi;
269
270     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272
273     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275 }
276
277 static force_inline void
278 negate_2x128 (__m128i  data_lo,
279               __m128i  data_hi,
280               __m128i* neg_lo,
281               __m128i* neg_hi)
282 {
283     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285 }
286
287 static force_inline void
288 invert_colors_2x128 (__m128i  data_lo,
289                      __m128i  data_hi,
290                      __m128i* inv_lo,
291                      __m128i* inv_hi)
292 {
293     __m128i lo, hi;
294
295     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299 }
300
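/* over_2x128 implements the OVER operator on unpacked pixels:
 *
 *     dst = src + dst * (255 - alpha) / 255
 *
 * where src is premultiplied and alpha is its expanded alpha.
 */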
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
303             __m128i* src_hi,
304             __m128i* alpha_lo,
305             __m128i* alpha_hi,
306             __m128i* dst_lo,
307             __m128i* dst_hi)
308 {
309     __m128i t1, t2;
310
311     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
312
313     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
314
315     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
317 }
318
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i  src_lo,
321                         __m128i  src_hi,
322                         __m128i* dst_lo,
323                         __m128i* dst_hi)
324 {
325     __m128i lo, hi;
326     __m128i alpha_lo, alpha_hi;
327
328     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
329
330     lo = _mm_or_si128 (alpha_lo, mask_alpha);
331     hi = _mm_or_si128 (alpha_hi, mask_alpha);
332
333     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
334
335     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
336
337     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
338 }
339
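/* in_over_2x128 is (src IN mask) OVER dst: the source and its expanded
 * alpha are both multiplied by the mask first, and the result is then
 * composited over the destination.
 */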
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
342                __m128i* src_hi,
343                __m128i* alpha_lo,
344                __m128i* alpha_hi,
345                __m128i* mask_lo,
346                __m128i* mask_hi,
347                __m128i* dst_lo,
348                __m128i* dst_hi)
349 {
350     __m128i s_lo, s_hi;
351     __m128i a_lo, a_hi;
352
353     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355
356     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357 }
358
359 static force_inline void
360 cache_prefetch (__m128i* addr)
361 {
362     _mm_prefetch ((void const*)addr, _MM_HINT_T0);
363 }
364
365 static force_inline void
366 cache_prefetch_next (__m128i* addr)
367 {
368     _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
369 }
370
371 /* prefetching NULL is very slow on some systems. don't do that. */
372
373 static force_inline void
374 maybe_prefetch (__m128i* addr)
375 {
376     if (addr)
377         cache_prefetch (addr);
378 }
379
380 static force_inline void
381 maybe_prefetch_next (__m128i* addr)
382 {
383     if (addr)
384         cache_prefetch_next (addr);
385 }
386
387 /* load 4 pixels from a 16-byte boundary aligned address */
388 static force_inline __m128i
389 load_128_aligned (__m128i* src)
390 {
391     return _mm_load_si128 (src);
392 }
393
394 /* load 4 pixels from an unaligned address */
395 static force_inline __m128i
396 load_128_unaligned (const __m128i* src)
397 {
398     return _mm_loadu_si128 (src);
399 }
400
401 /* save 4 pixels using Write Combining memory on a 16-byte
402  * boundary aligned address
403  */
404 static force_inline void
405 save_128_write_combining (__m128i* dst,
406                           __m128i  data)
407 {
408     _mm_stream_si128 (dst, data);
409 }
410
411 /* save 4 pixels on a 16-byte boundary aligned address */
412 static force_inline void
413 save_128_aligned (__m128i* dst,
414                   __m128i  data)
415 {
416     _mm_store_si128 (dst, data);
417 }
418
419 /* save 4 pixels to an unaligned address */
420 static force_inline void
421 save_128_unaligned (__m128i* dst,
422                     __m128i  data)
423 {
424     _mm_storeu_si128 (dst, data);
425 }
426
427 /* ------------------------------------------------------------------
428  * MMX inlines
429  */
430
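/* The __m64 helpers below mirror the __m128i versions above but work
 * on a single unpacked pixel; the combiners use them for the unaligned
 * head and tail pixels of each run.
 */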
431 static force_inline __m64
432 load_32_1x64 (uint32_t data)
433 {
434     return _mm_cvtsi32_si64 (data);
435 }
436
437 static force_inline __m64
438 unpack_32_1x64 (uint32_t data)
439 {
440     return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
441 }
442
443 static force_inline __m64
444 expand_alpha_1x64 (__m64 data)
445 {
446     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
447 }
448
449 static force_inline __m64
450 expand_alpha_rev_1x64 (__m64 data)
451 {
452     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
453 }
454
455 static force_inline __m64
456 expand_pixel_8_1x64 (uint8_t data)
457 {
458     return _mm_shuffle_pi16 (
459         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
460 }
461
462 static force_inline __m64
463 pix_multiply_1x64 (__m64 data,
464                    __m64 alpha)
465 {
466     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
467                                           mask_x0080),
468                            mask_x0101);
469 }
470
471 static force_inline __m64
472 pix_add_multiply_1x64 (__m64* src,
473                        __m64* alpha_dst,
474                        __m64* dst,
475                        __m64* alpha_src)
476 {
477     __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
478     __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
479
480     return _mm_adds_pu8 (t1, t2);
481 }
482
483 static force_inline __m64
484 negate_1x64 (__m64 data)
485 {
486     return _mm_xor_si64 (data, mask_x00ff);
487 }
488
489 static force_inline __m64
490 invert_colors_1x64 (__m64 data)
491 {
492     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
493 }
494
495 static force_inline __m64
496 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
497 {
498     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
499 }
500
501 static force_inline __m64
502 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
503 {
504     return over_1x64 (pix_multiply_1x64 (*src, *mask),
505                       pix_multiply_1x64 (*alpha, *mask),
506                       *dst);
507 }
508
509 static force_inline __m64
510 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
511 {
512     __m64 alpha = expand_alpha_1x64 (src);
513
514     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
515                                          _mm_or_si64 (alpha, mask_x_alpha)),
516                       alpha,
517                       dst);
518 }
519
520 static force_inline uint32_t
521 pack_1x64_32 (__m64 data)
522 {
523     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
524 }
525
526 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into
527  *
528  *    00RR00GG00BB
529  *
530  * --- Expanding 565 in the low word ---
531  *
532  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
533  * m = m & (01f0003f001f);
534  * m = m * (008404100840);
535  * m = m >> 8;
536  *
537  * Note the trick here - the top word is shifted by another nibble to
538  * avoid it bumping into the middle word
539  */
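/* Worked example for a pure red pixel, 0xf800 (r = 0x1f, g = b = 0):
 * after the two shifts and ORs the red field sits in bits 36-40, and
 * masking with 0x01f0003f001f keeps 0x01f0 in the top word.  The
 * multiply by 0x0084 followed by the shift right by 8 expands the
 * 5-bit value to 8 bits (r << 3 | r >> 2), giving 0x00ff00000000,
 * i.e. 00RR00GG00BB with RR = 0xff.
 */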
540 static force_inline __m64
541 expand565_16_1x64 (uint16_t pixel)
542 {
543     __m64 p;
544     __m64 t1, t2;
545
546     p = _mm_cvtsi32_si64 ((uint32_t) pixel);
547
548     t1 = _mm_slli_si64 (p, 36 - 11);
549     t2 = _mm_slli_si64 (p, 16 - 5);
550
551     p = _mm_or_si64 (t1, p);
552     p = _mm_or_si64 (t2, p);
553     p = _mm_and_si64 (p, mask_x565_rgb);
554     p = _mm_mullo_pi16 (p, mask_x565_unpack);
555
556     return _mm_srli_pi16 (p, 8);
557 }
558
559 /* ----------------------------------------------------------------------------
560  * Compose Core transformations
561  */
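/* Single-pixel OVER with the two cheap special cases: a fully opaque
 * source replaces the destination outright, and a zero source leaves
 * the destination untouched.
 */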
562 static force_inline uint32_t
563 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
564 {
565     uint8_t a;
566     __m64 ms;
567
568     a = src >> 24;
569
570     if (a == 0xff)
571     {
572         return src;
573     }
574     else if (src)
575     {
576         ms = unpack_32_1x64 (src);
577         return pack_1x64_32 (
578             over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
579     }
580
581     return dst;
582 }
583
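/* combine1 and combine4 fetch one and four source pixels respectively,
 * applying the optional mask on the way: when pm is non-NULL the
 * source is multiplied by the mask's expanded alpha (this is the
 * unified-alpha path, so only the mask's alpha channel matters).
 * combine4 also short-circuits a fully transparent mask to zero.
 */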
584 static force_inline uint32_t
585 combine1 (const uint32_t *ps, const uint32_t *pm)
586 {
587     uint32_t s = *ps;
588
589     if (pm)
590     {
591         __m64 ms, mm;
592
593         mm = unpack_32_1x64 (*pm);
594         mm = expand_alpha_1x64 (mm);
595
596         ms = unpack_32_1x64 (s);
597         ms = pix_multiply_1x64 (ms, mm);
598
599         s = pack_1x64_32 (ms);
600     }
601
602     return s;
603 }
604
605 static force_inline __m128i
606 combine4 (const __m128i *ps, const __m128i *pm)
607 {
608     __m128i xmm_src_lo, xmm_src_hi;
609     __m128i xmm_msk_lo, xmm_msk_hi;
610     __m128i s;
611
612     if (pm)
613     {
614         xmm_msk_lo = load_128_unaligned (pm);
615
616         if (is_transparent (xmm_msk_lo))
617             return _mm_setzero_si128 ();
618     }
619
620     s = load_128_unaligned (ps);
621
622     if (pm)
623     {
624         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
625         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
626
627         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
628
629         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
630                             &xmm_msk_lo, &xmm_msk_hi,
631                             &xmm_src_lo, &xmm_src_hi);
632
633         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
634     }
635
636     return s;
637 }
638
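/* All of the core_combine_*_sse2 routines below share the same shape:
 * a scalar head loop that advances pd to a 16-byte boundary, a main
 * loop that handles four pixels per iteration with aligned stores
 * (prefetching the next cache line as it goes), and a scalar tail loop
 * for whatever is left.  The OVER combiner additionally skips the
 * blend entirely when a whole 4-pixel group is opaque or zero.
 */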
639 static force_inline void
640 core_combine_over_u_sse2 (uint32_t*       pd,
641                           const uint32_t* ps,
642                           const uint32_t* pm,
643                           int             w)
644 {
645     uint32_t s, d;
646
647     __m128i xmm_dst_lo, xmm_dst_hi;
648     __m128i xmm_src_lo, xmm_src_hi;
649     __m128i xmm_alpha_lo, xmm_alpha_hi;
650
651     /* call prefetch hint to optimize cache load*/
652     cache_prefetch ((__m128i*)ps);
653     cache_prefetch ((__m128i*)pd);
654     maybe_prefetch ((__m128i*)pm);
655
656     /* Align dst on a 16-byte boundary */
657     while (w && ((unsigned long)pd & 15))
658     {
659         d = *pd;
660         s = combine1 (ps, pm);
661
662         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
663         ps++;
664         if (pm)
665             pm++;
666         w--;
667     }
668
669     /* call prefetch hint to optimize cache load*/
670     cache_prefetch ((__m128i*)ps);
671     cache_prefetch ((__m128i*)pd);
672     maybe_prefetch ((__m128i*)pm);
673
674     while (w >= 4)
675     {
676         /* fill cache line with next memory */
677         cache_prefetch_next ((__m128i*)ps);
678         cache_prefetch_next ((__m128i*)pd);
679         maybe_prefetch_next ((__m128i*)pm);
680
681         /* I'm loading unaligned because I'm not sure about
682          * the address alignment.
683          */
684         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
685
686         if (is_opaque (xmm_src_hi))
687         {
688             save_128_aligned ((__m128i*)pd, xmm_src_hi);
689         }
690         else if (!is_zero (xmm_src_hi))
691         {
692             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
693
694             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
695             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
696
697             expand_alpha_2x128 (
698                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
699
700             over_2x128 (&xmm_src_lo, &xmm_src_hi,
701                         &xmm_alpha_lo, &xmm_alpha_hi,
702                         &xmm_dst_lo, &xmm_dst_hi);
703
704             /* rebuild the 4 pixel data and save */
705             save_128_aligned ((__m128i*)pd,
706                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
707         }
708
709         w -= 4;
710         ps += 4;
711         pd += 4;
712         if (pm)
713             pm += 4;
714     }
715
716     while (w)
717     {
718         d = *pd;
719         s = combine1 (ps, pm);
720
721         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
722         ps++;
723         if (pm)
724             pm++;
725
726         w--;
727     }
728 }
729
730 static force_inline void
731 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
732                                   const uint32_t* ps,
733                                   const uint32_t* pm,
734                                   int             w)
735 {
736     uint32_t s, d;
737
738     __m128i xmm_dst_lo, xmm_dst_hi;
739     __m128i xmm_src_lo, xmm_src_hi;
740     __m128i xmm_alpha_lo, xmm_alpha_hi;
741
742     /* call prefetch hint to optimize cache load*/
743     cache_prefetch ((__m128i*)ps);
744     cache_prefetch ((__m128i*)pd);
745     maybe_prefetch ((__m128i*)pm);
746
747     /* Align dst on a 16-byte boundary */
748     while (w &&
749            ((unsigned long)pd & 15))
750     {
751         d = *pd;
752         s = combine1 (ps, pm);
753
754         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
755         w--;
756         ps++;
757         if (pm)
758             pm++;
759     }
760
761     /* call prefetch hint to optimize cache load*/
762     cache_prefetch ((__m128i*)ps);
763     cache_prefetch ((__m128i*)pd);
764     maybe_prefetch ((__m128i*)pm);
765
766     while (w >= 4)
767     {
768         /* fill cache line with next memory */
769         cache_prefetch_next ((__m128i*)ps);
770         cache_prefetch_next ((__m128i*)pd);
771         maybe_prefetch_next ((__m128i*)pm);
772
773         /* I'm loading unaligned because I'm not sure
774          * about the address alignment.
775          */
776         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
777         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
778
779         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
780         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
781
782         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
783                             &xmm_alpha_lo, &xmm_alpha_hi);
784
785         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
786                     &xmm_alpha_lo, &xmm_alpha_hi,
787                     &xmm_src_lo, &xmm_src_hi);
788
789         /* rebuild the 4 pixel data and save */
790         save_128_aligned ((__m128i*)pd,
791                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
792
793         w -= 4;
794         ps += 4;
795         pd += 4;
796
797         if (pm)
798             pm += 4;
799     }
800
801     while (w)
802     {
803         d = *pd;
804         s = combine1 (ps, pm);
805
806         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
807         ps++;
808         w--;
809         if (pm)
810             pm++;
811     }
812 }
813
814 static force_inline uint32_t
815 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
816 {
817     uint32_t maska = src >> 24;
818
819     if (maska == 0)
820     {
821         return 0;
822     }
823     else if (maska != 0xff)
824     {
825         return pack_1x64_32 (
826             pix_multiply_1x64 (unpack_32_1x64 (dst),
827                                expand_alpha_1x64 (unpack_32_1x64 (src))));
828     }
829
830     return dst;
831 }
832
833 static force_inline void
834 core_combine_in_u_sse2 (uint32_t*       pd,
835                         const uint32_t* ps,
836                         const uint32_t* pm,
837                         int             w)
838 {
839     uint32_t s, d;
840
841     __m128i xmm_src_lo, xmm_src_hi;
842     __m128i xmm_dst_lo, xmm_dst_hi;
843
844     /* call prefetch hint to optimize cache load*/
845     cache_prefetch ((__m128i*)ps);
846     cache_prefetch ((__m128i*)pd);
847     maybe_prefetch ((__m128i*)pm);
848
849     while (w && ((unsigned long) pd & 15))
850     {
851         s = combine1 (ps, pm);
852         d = *pd;
853
854         *pd++ = core_combine_in_u_pixelsse2 (d, s);
855         w--;
856         ps++;
857         if (pm)
858             pm++;
859     }
860
861     /* call prefetch hint to optimize cache load*/
862     cache_prefetch ((__m128i*)ps);
863     cache_prefetch ((__m128i*)pd);
864     maybe_prefetch ((__m128i*)pm);
865
866     while (w >= 4)
867     {
868         /* fill cache line with next memory */
869         cache_prefetch_next ((__m128i*)ps);
870         cache_prefetch_next ((__m128i*)pd);
871         maybe_prefetch_next ((__m128i*)pm);
872
873         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
874         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
875
876         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
877         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
878
879         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
880         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
881                             &xmm_dst_lo, &xmm_dst_hi,
882                             &xmm_dst_lo, &xmm_dst_hi);
883
884         save_128_aligned ((__m128i*)pd,
885                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
886
887         ps += 4;
888         pd += 4;
889         w -= 4;
890         if (pm)
891             pm += 4;
892     }
893
894     while (w)
895     {
896         s = combine1 (ps, pm);
897         d = *pd;
898
899         *pd++ = core_combine_in_u_pixelsse2 (d, s);
900         w--;
901         ps++;
902         if (pm)
903             pm++;
904     }
905 }
906
907 static force_inline void
908 core_combine_reverse_in_u_sse2 (uint32_t*       pd,
909                                 const uint32_t* ps,
910                                 const uint32_t *pm,
911                                 int             w)
912 {
913     uint32_t s, d;
914
915     __m128i xmm_src_lo, xmm_src_hi;
916     __m128i xmm_dst_lo, xmm_dst_hi;
917
918     /* call prefetch hint to optimize cache load*/
919     cache_prefetch ((__m128i*)ps);
920     cache_prefetch ((__m128i*)pd);
921     maybe_prefetch ((__m128i*)pm);
922
923     while (w && ((unsigned long) pd & 15))
924     {
925         s = combine1 (ps, pm);
926         d = *pd;
927
928         *pd++ = core_combine_in_u_pixelsse2 (s, d);
929         ps++;
930         w--;
931         if (pm)
932             pm++;
933     }
934
935     /* call prefetch hint to optimize cache load*/
936     cache_prefetch ((__m128i*)ps);
937     cache_prefetch ((__m128i*)pd);
938     maybe_prefetch ((__m128i*)pm);
939
940     while (w >= 4)
941     {
942         /* fill cache line with next memory */
943         cache_prefetch_next ((__m128i*)ps);
944         cache_prefetch_next ((__m128i*)pd);
945         maybe_prefetch_next ((__m128i*)pm);
946
947         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
948         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
949
950         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
951         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
952
953         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
954         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
955                             &xmm_src_lo, &xmm_src_hi,
956                             &xmm_dst_lo, &xmm_dst_hi);
957
958         save_128_aligned (
959             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
960
961         ps += 4;
962         pd += 4;
963         w -= 4;
964         if (pm)
965             pm += 4;
966     }
967
968     while (w)
969     {
970         s = combine1 (ps, pm);
971         d = *pd;
972
973         *pd++ = core_combine_in_u_pixelsse2 (s, d);
974         w--;
975         ps++;
976         if (pm)
977             pm++;
978     }
979 }
980
981 static force_inline void
982 core_combine_reverse_out_u_sse2 (uint32_t*       pd,
983                                  const uint32_t* ps,
984                                  const uint32_t* pm,
985                                  int             w)
986 {
987     /* call prefetch hint to optimize cache load*/
988     cache_prefetch ((__m128i*)ps);
989     cache_prefetch ((__m128i*)pd);
990     maybe_prefetch ((__m128i*)pm);
991
992     while (w && ((unsigned long) pd & 15))
993     {
994         uint32_t s = combine1 (ps, pm);
995         uint32_t d = *pd;
996
997         *pd++ = pack_1x64_32 (
998             pix_multiply_1x64 (
999                 unpack_32_1x64 (d), negate_1x64 (
1000                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1001
1002         if (pm)
1003             pm++;
1004         ps++;
1005         w--;
1006     }
1007
1008     /* call prefetch hint to optimize cache load*/
1009     cache_prefetch ((__m128i*)ps);
1010     cache_prefetch ((__m128i*)pd);
1011     maybe_prefetch ((__m128i*)pm);
1012
1013     while (w >= 4)
1014     {
1015         __m128i xmm_src_lo, xmm_src_hi;
1016         __m128i xmm_dst_lo, xmm_dst_hi;
1017
1018         /* fill cache line with next memory */
1019         cache_prefetch_next ((__m128i*)ps);
1020         cache_prefetch_next ((__m128i*)pd);
1021         maybe_prefetch_next ((__m128i*)pm);
1022
1023         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1024         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1025
1026         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1027         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1028
1029         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1030         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1031
1032         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1033                             &xmm_src_lo, &xmm_src_hi,
1034                             &xmm_dst_lo, &xmm_dst_hi);
1035
1036         save_128_aligned (
1037             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1038
1039         ps += 4;
1040         pd += 4;
1041         if (pm)
1042             pm += 4;
1043
1044         w -= 4;
1045     }
1046
1047     while (w)
1048     {
1049         uint32_t s = combine1 (ps, pm);
1050         uint32_t d = *pd;
1051
1052         *pd++ = pack_1x64_32 (
1053             pix_multiply_1x64 (
1054                 unpack_32_1x64 (d), negate_1x64 (
1055                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1056         ps++;
1057         if (pm)
1058             pm++;
1059         w--;
1060     }
1061 }
1062
1063 static force_inline void
1064 core_combine_out_u_sse2 (uint32_t*       pd,
1065                          const uint32_t* ps,
1066                          const uint32_t* pm,
1067                          int             w)
1068 {
1069     /* call prefetch hint to optimize cache load*/
1070     cache_prefetch ((__m128i*)ps);
1071     cache_prefetch ((__m128i*)pd);
1072     maybe_prefetch ((__m128i*)pm);
1073
1074     while (w && ((unsigned long) pd & 15))
1075     {
1076         uint32_t s = combine1 (ps, pm);
1077         uint32_t d = *pd;
1078
1079         *pd++ = pack_1x64_32 (
1080             pix_multiply_1x64 (
1081                 unpack_32_1x64 (s), negate_1x64 (
1082                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1083         w--;
1084         ps++;
1085         if (pm)
1086             pm++;
1087     }
1088
1089     /* call prefetch hint to optimize cache load*/
1090     cache_prefetch ((__m128i*)ps);
1091     cache_prefetch ((__m128i*)pd);
1092     maybe_prefetch ((__m128i*)pm);
1093
1094     while (w >= 4)
1095     {
1096         __m128i xmm_src_lo, xmm_src_hi;
1097         __m128i xmm_dst_lo, xmm_dst_hi;
1098
1099         /* fill cache line with next memory */
1100         cache_prefetch_next ((__m128i*)ps);
1101         cache_prefetch_next ((__m128i*)pd);
1102         maybe_prefetch_next ((__m128i*)pm);
1103
1104         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1105         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1106
1107         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1108         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1109
1110         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1111         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1112
1113         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1114                             &xmm_dst_lo, &xmm_dst_hi,
1115                             &xmm_dst_lo, &xmm_dst_hi);
1116
1117         save_128_aligned (
1118             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1119
1120         ps += 4;
1121         pd += 4;
1122         w -= 4;
1123         if (pm)
1124             pm += 4;
1125     }
1126
1127     while (w)
1128     {
1129         uint32_t s = combine1 (ps, pm);
1130         uint32_t d = *pd;
1131
1132         *pd++ = pack_1x64_32 (
1133             pix_multiply_1x64 (
1134                 unpack_32_1x64 (s), negate_1x64 (
1135                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1136         w--;
1137         ps++;
1138         if (pm)
1139             pm++;
1140     }
1141 }
1142
1143 static force_inline uint32_t
1144 core_combine_atop_u_pixel_sse2 (uint32_t src,
1145                                 uint32_t dst)
1146 {
1147     __m64 s = unpack_32_1x64 (src);
1148     __m64 d = unpack_32_1x64 (dst);
1149
1150     __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1151     __m64 da = expand_alpha_1x64 (d);
1152
1153     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1154 }
1155
1156 static force_inline void
1157 core_combine_atop_u_sse2 (uint32_t*       pd,
1158                           const uint32_t* ps,
1159                           const uint32_t* pm,
1160                           int             w)
1161 {
1162     uint32_t s, d;
1163
1164     __m128i xmm_src_lo, xmm_src_hi;
1165     __m128i xmm_dst_lo, xmm_dst_hi;
1166     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1167     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1168
1169     /* call prefetch hint to optimize cache load*/
1170     cache_prefetch ((__m128i*)ps);
1171     cache_prefetch ((__m128i*)pd);
1172     maybe_prefetch ((__m128i*)pm);
1173
1174     while (w && ((unsigned long) pd & 15))
1175     {
1176         s = combine1 (ps, pm);
1177         d = *pd;
1178
1179         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1180         w--;
1181         ps++;
1182         if (pm)
1183             pm++;
1184     }
1185
1186     /* call prefetch hint to optimize cache load*/
1187     cache_prefetch ((__m128i*)ps);
1188     cache_prefetch ((__m128i*)pd);
1189     maybe_prefetch ((__m128i*)pm);
1190
1191     while (w >= 4)
1192     {
1193         /* fill cache line with next memory */
1194         cache_prefetch_next ((__m128i*)ps);
1195         cache_prefetch_next ((__m128i*)pd);
1196         maybe_prefetch_next ((__m128i*)pm);
1197
1198         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1199         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1200
1201         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1202         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1203
1204         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1205                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1206         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1207                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1208
1209         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1210                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1211
1212         pix_add_multiply_2x128 (
1213             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1214             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1215             &xmm_dst_lo, &xmm_dst_hi);
1216
1217         save_128_aligned (
1218             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1219
1220         ps += 4;
1221         pd += 4;
1222         w -= 4;
1223         if (pm)
1224             pm += 4;
1225     }
1226
1227     while (w)
1228     {
1229         s = combine1 (ps, pm);
1230         d = *pd;
1231
1232         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1233         w--;
1234         ps++;
1235         if (pm)
1236             pm++;
1237     }
1238 }
1239
1240 static force_inline uint32_t
1241 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1242                                         uint32_t dst)
1243 {
1244     __m64 s = unpack_32_1x64 (src);
1245     __m64 d = unpack_32_1x64 (dst);
1246
1247     __m64 sa = expand_alpha_1x64 (s);
1248     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1249
1250     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1251 }
1252
1253 static force_inline void
1254 core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
1255                                   const uint32_t* ps,
1256                                   const uint32_t* pm,
1257                                   int             w)
1258 {
1259     uint32_t s, d;
1260
1261     __m128i xmm_src_lo, xmm_src_hi;
1262     __m128i xmm_dst_lo, xmm_dst_hi;
1263     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1264     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1265
1266     /* call prefetch hint to optimize cache load*/
1267     cache_prefetch ((__m128i*)ps);
1268     cache_prefetch ((__m128i*)pd);
1269     maybe_prefetch ((__m128i*)pm);
1270
1271     while (w && ((unsigned long) pd & 15))
1272     {
1273         s = combine1 (ps, pm);
1274         d = *pd;
1275
1276         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1277         ps++;
1278         w--;
1279         if (pm)
1280             pm++;
1281     }
1282
1283     /* call prefetch hint to optimize cache load*/
1284     cache_prefetch ((__m128i*)ps);
1285     cache_prefetch ((__m128i*)pd);
1286     maybe_prefetch ((__m128i*)pm);
1287
1288     while (w >= 4)
1289     {
1290         /* fill cache line with next memory */
1291         cache_prefetch_next ((__m128i*)ps);
1292         cache_prefetch_next ((__m128i*)pd);
1293         maybe_prefetch_next ((__m128i*)pm);
1294
1295         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1296         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1297
1298         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1299         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1300
1301         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1302                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1303         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1304                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1305
1306         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1307                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1308
1309         pix_add_multiply_2x128 (
1310             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1311             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1312             &xmm_dst_lo, &xmm_dst_hi);
1313
1314         save_128_aligned (
1315             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1316
1317         ps += 4;
1318         pd += 4;
1319         w -= 4;
1320         if (pm)
1321             pm += 4;
1322     }
1323
1324     while (w)
1325     {
1326         s = combine1 (ps, pm);
1327         d = *pd;
1328
1329         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1330         ps++;
1331         w--;
1332         if (pm)
1333             pm++;
1334     }
1335 }
1336
1337 static force_inline uint32_t
1338 core_combine_xor_u_pixel_sse2 (uint32_t src,
1339                                uint32_t dst)
1340 {
1341     __m64 s = unpack_32_1x64 (src);
1342     __m64 d = unpack_32_1x64 (dst);
1343
1344     __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1345     __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1346
1347     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1348 }
1349
1350 static force_inline void
1351 core_combine_xor_u_sse2 (uint32_t*       dst,
1352                          const uint32_t* src,
1353                          const uint32_t *mask,
1354                          int             width)
1355 {
1356     int w = width;
1357     uint32_t s, d;
1358     uint32_t* pd = dst;
1359     const uint32_t* ps = src;
1360     const uint32_t* pm = mask;
1361
1362     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1363     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1364     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1365     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1366
1367     /* call prefetch hint to optimize cache load*/
1368     cache_prefetch ((__m128i*)ps);
1369     cache_prefetch ((__m128i*)pd);
1370     maybe_prefetch ((__m128i*)pm);
1371
1372     while (w && ((unsigned long) pd & 15))
1373     {
1374         s = combine1 (ps, pm);
1375         d = *pd;
1376
1377         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1378         w--;
1379         ps++;
1380         if (pm)
1381             pm++;
1382     }
1383
1384     /* call prefetch hint to optimize cache load*/
1385     cache_prefetch ((__m128i*)ps);
1386     cache_prefetch ((__m128i*)pd);
1387     maybe_prefetch ((__m128i*)pm);
1388
1389     while (w >= 4)
1390     {
1391         /* fill cache line with next memory */
1392         cache_prefetch_next ((__m128i*)ps);
1393         cache_prefetch_next ((__m128i*)pd);
1394         maybe_prefetch_next ((__m128i*)pm);
1395
1396         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1397         xmm_dst = load_128_aligned ((__m128i*) pd);
1398
1399         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1400         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1401
1402         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1403                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1404         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1405                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1406
1407         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1408                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1409         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1410                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1411
1412         pix_add_multiply_2x128 (
1413             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1414             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1415             &xmm_dst_lo, &xmm_dst_hi);
1416
1417         save_128_aligned (
1418             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1419
1420         ps += 4;
1421         pd += 4;
1422         w -= 4;
1423         if (pm)
1424             pm += 4;
1425     }
1426
1427     while (w)
1428     {
1429         s = combine1 (ps, pm);
1430         d = *pd;
1431
1432         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1433         w--;
1434         ps++;
1435         if (pm)
1436             pm++;
1437     }
1438 }
1439
1440 static force_inline void
1441 core_combine_add_u_sse2 (uint32_t*       dst,
1442                          const uint32_t* src,
1443                          const uint32_t* mask,
1444                          int             width)
1445 {
1446     int w = width;
1447     uint32_t s, d;
1448     uint32_t* pd = dst;
1449     const uint32_t* ps = src;
1450     const uint32_t* pm = mask;
1451
1452     /* call prefetch hint to optimize cache load*/
1453     cache_prefetch ((__m128i*)ps);
1454     cache_prefetch ((__m128i*)pd);
1455     maybe_prefetch ((__m128i*)pm);
1456
1457     while (w && (unsigned long)pd & 15)
1458     {
1459         s = combine1 (ps, pm);
1460         d = *pd;
1461
1462         ps++;
1463         if (pm)
1464             pm++;
1465         *pd++ = _mm_cvtsi64_si32 (
1466             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1467         w--;
1468     }
1469
1470     /* call prefetch hint to optimize cache load*/
1471     cache_prefetch ((__m128i*)ps);
1472     cache_prefetch ((__m128i*)pd);
1473     maybe_prefetch ((__m128i*)pm);
1474
1475     while (w >= 4)
1476     {
1477         __m128i s;
1478
1479         /* fill cache line with next memory */
1480         cache_prefetch_next ((__m128i*)ps);
1481         cache_prefetch_next ((__m128i*)pd);
1482         maybe_prefetch_next ((__m128i*)pm);
1483
1484         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1485
1486         save_128_aligned (
1487             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1488
1489         pd += 4;
1490         ps += 4;
1491         if (pm)
1492             pm += 4;
1493         w -= 4;
1494     }
1495
1496     while (w--)
1497     {
1498         s = combine1 (ps, pm);
1499         d = *pd;
1500
1501         ps++;
1502         *pd++ = _mm_cvtsi64_si32 (
1503             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1504         if (pm)
1505             pm++;
1506     }
1507 }
1508
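/* SATURATE: if the source alpha exceeds the space left in the
 * destination (255 - dst alpha), the source is scaled by
 * DIV_UN8 (255 - dst_alpha, src_alpha) before the saturating add, so
 * the destination alpha cannot overflow.
 */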
1509 static force_inline uint32_t
1510 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1511                                     uint32_t dst)
1512 {
1513     __m64 ms = unpack_32_1x64 (src);
1514     __m64 md = unpack_32_1x64 (dst);
1515     uint32_t sa = src >> 24;
1516     uint32_t da = ~dst >> 24;
1517
1518     if (sa > da)
1519     {
1520         ms = pix_multiply_1x64 (
1521             ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1522     }
1523
1524     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1525 }
1526
1527 static force_inline void
1528 core_combine_saturate_u_sse2 (uint32_t *      pd,
1529                               const uint32_t *ps,
1530                               const uint32_t *pm,
1531                               int             w)
1532 {
1533     uint32_t s, d;
1534
1535     uint32_t pack_cmp;
1536     __m128i xmm_src, xmm_dst;
1537
1538     /* call prefetch hint to optimize cache load*/
1539     cache_prefetch ((__m128i*)ps);
1540     cache_prefetch ((__m128i*)pd);
1541     maybe_prefetch ((__m128i*)pm);
1542
1543     while (w && (unsigned long)pd & 15)
1544     {
1545         s = combine1 (ps, pm);
1546         d = *pd;
1547
1548         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1549         w--;
1550         ps++;
1551         if (pm)
1552             pm++;
1553     }
1554
1555     /* call prefetch hint to optimize cache load*/
1556     cache_prefetch ((__m128i*)ps);
1557     cache_prefetch ((__m128i*)pd);
1558     maybe_prefetch ((__m128i*)pm);
1559
1560     while (w >= 4)
1561     {
1562         /* fill cache line with next memory */
1563         cache_prefetch_next ((__m128i*)ps);
1564         cache_prefetch_next ((__m128i*)pd);
1565         maybe_prefetch_next ((__m128i*)pm);
1566
1567         xmm_dst = load_128_aligned  ((__m128i*)pd);
1568         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1569
1570         pack_cmp = _mm_movemask_epi8 (
1571             _mm_cmpgt_epi32 (
1572                 _mm_srli_epi32 (xmm_src, 24),
1573                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1574
1575         /* if some src alpha is greater than the respective ~dst alpha */
1576         if (pack_cmp)
1577         {
1578             s = combine1 (ps++, pm);
1579             d = *pd;
1580             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1581             if (pm)
1582                 pm++;
1583
1584             s = combine1 (ps++, pm);
1585             d = *pd;
1586             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1587             if (pm)
1588                 pm++;
1589
1590             s = combine1 (ps++, pm);
1591             d = *pd;
1592             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1593             if (pm)
1594                 pm++;
1595
1596             s = combine1 (ps++, pm);
1597             d = *pd;
1598             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1599             if (pm)
1600                 pm++;
1601         }
1602         else
1603         {
1604             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1605
1606             pd += 4;
1607             ps += 4;
1608             if (pm)
1609                 pm += 4;
1610         }
1611
1612         w -= 4;
1613     }
1614
1615     while (w--)
1616     {
1617         s = combine1 (ps, pm);
1618         d = *pd;
1619
1620         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1621         ps++;
1622         if (pm)
1623             pm++;
1624     }
1625 }
1626
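/* The core_combine_*_ca_sse2 routines below are the component-alpha
 * variants: the mask supplies a separate 8-bit factor for each channel
 * instead of a single alpha value, so pm is always a valid pointer
 * here and is applied with a full per-channel multiply.
 */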
1627 static force_inline void
1628 core_combine_src_ca_sse2 (uint32_t*       pd,
1629                           const uint32_t* ps,
1630                           const uint32_t *pm,
1631                           int             w)
1632 {
1633     uint32_t s, m;
1634
1635     __m128i xmm_src_lo, xmm_src_hi;
1636     __m128i xmm_mask_lo, xmm_mask_hi;
1637     __m128i xmm_dst_lo, xmm_dst_hi;
1638
1639     /* call prefetch hint to optimize cache load*/
1640     cache_prefetch ((__m128i*)ps);
1641     cache_prefetch ((__m128i*)pd);
1642     cache_prefetch ((__m128i*)pm);
1643
1644     while (w && (unsigned long)pd & 15)
1645     {
1646         s = *ps++;
1647         m = *pm++;
1648         *pd++ = pack_1x64_32 (
1649             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1650         w--;
1651     }
1652
1653     /* call prefetch hint to optimize cache load*/
1654     cache_prefetch ((__m128i*)ps);
1655     cache_prefetch ((__m128i*)pd);
1656     cache_prefetch ((__m128i*)pm);
1657
1658     while (w >= 4)
1659     {
1660         /* fill cache line with next memory */
1661         cache_prefetch_next ((__m128i*)ps);
1662         cache_prefetch_next ((__m128i*)pd);
1663         cache_prefetch_next ((__m128i*)pm);
1664
1665         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1666         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1667
1668         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1669         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1670
1671         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1672                             &xmm_mask_lo, &xmm_mask_hi,
1673                             &xmm_dst_lo, &xmm_dst_hi);
1674
1675         save_128_aligned (
1676             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1677
1678         ps += 4;
1679         pd += 4;
1680         pm += 4;
1681         w -= 4;
1682     }
1683
1684     while (w)
1685     {
1686         s = *ps++;
1687         m = *pm++;
1688         *pd++ = pack_1x64_32 (
1689             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1690         w--;
1691     }
1692 }
1693
1694 static force_inline uint32_t
1695 core_combine_over_ca_pixel_sse2 (uint32_t src,
1696                                  uint32_t mask,
1697                                  uint32_t dst)
1698 {
1699     __m64 s = unpack_32_1x64 (src);
1700     __m64 expAlpha = expand_alpha_1x64 (s);
1701     __m64 unpk_mask = unpack_32_1x64 (mask);
1702     __m64 unpk_dst  = unpack_32_1x64 (dst);
1703
1704     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1705 }
1706
1707 static force_inline void
1708 core_combine_over_ca_sse2 (uint32_t*       pd,
1709                            const uint32_t* ps,
1710                            const uint32_t *pm,
1711                            int             w)
1712 {
1713     uint32_t s, m, d;
1714
1715     __m128i xmm_alpha_lo, xmm_alpha_hi;
1716     __m128i xmm_src_lo, xmm_src_hi;
1717     __m128i xmm_dst_lo, xmm_dst_hi;
1718     __m128i xmm_mask_lo, xmm_mask_hi;
1719
1720     /* call prefetch hint to optimize cache load*/
1721     cache_prefetch ((__m128i*)ps);
1722     cache_prefetch ((__m128i*)pd);
1723     cache_prefetch ((__m128i*)pm);
1724
1725     while (w && (unsigned long)pd & 15)
1726     {
1727         s = *ps++;
1728         m = *pm++;
1729         d = *pd;
1730
1731         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1732         w--;
1733     }
1734
1735     /* call prefetch hint to optimize cache load*/
1736     cache_prefetch ((__m128i*)ps);
1737     cache_prefetch ((__m128i*)pd);
1738     cache_prefetch ((__m128i*)pm);
1739
1740     while (w >= 4)
1741     {
1742         /* fill cache line with next memory */
1743         cache_prefetch_next ((__m128i*)ps);
1744         cache_prefetch_next ((__m128i*)pd);
1745         cache_prefetch_next ((__m128i*)pm);
1746
1747         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1748         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1749         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1750
1751         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1752         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1753         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1754
1755         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1756                             &xmm_alpha_lo, &xmm_alpha_hi);
1757
1758         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1759                        &xmm_alpha_lo, &xmm_alpha_hi,
1760                        &xmm_mask_lo, &xmm_mask_hi,
1761                        &xmm_dst_lo, &xmm_dst_hi);
1762
1763         save_128_aligned (
1764             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1765
1766         ps += 4;
1767         pd += 4;
1768         pm += 4;
1769         w -= 4;
1770     }
1771
1772     while (w)
1773     {
1774         s = *ps++;
1775         m = *pm++;
1776         d = *pd;
1777
1778         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1779         w--;
1780     }
1781 }
1782
1783 static force_inline uint32_t
1784 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1785                                          uint32_t mask,
1786                                          uint32_t dst)
1787 {
1788     __m64 d = unpack_32_1x64 (dst);
1789
1790     return pack_1x64_32 (
1791         over_1x64 (d, expand_alpha_1x64 (d),
1792                    pix_multiply_1x64 (unpack_32_1x64 (src),
1793                                       unpack_32_1x64 (mask))));
1794 }
1795
1796 static force_inline void
1797 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1798                                    const uint32_t* ps,
1799                                    const uint32_t *pm,
1800                                    int             w)
1801 {
1802     uint32_t s, m, d;
1803
1804     __m128i xmm_alpha_lo, xmm_alpha_hi;
1805     __m128i xmm_src_lo, xmm_src_hi;
1806     __m128i xmm_dst_lo, xmm_dst_hi;
1807     __m128i xmm_mask_lo, xmm_mask_hi;
1808
1809     /* call prefetch hint to optimize cache load*/
1810     cache_prefetch ((__m128i*)ps);
1811     cache_prefetch ((__m128i*)pd);
1812     cache_prefetch ((__m128i*)pm);
1813
1814     while (w && (unsigned long)pd & 15)
1815     {
1816         s = *ps++;
1817         m = *pm++;
1818         d = *pd;
1819
1820         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1821         w--;
1822     }
1823
1824     /* call prefetch hint to optimize cache load*/
1825     cache_prefetch ((__m128i*)ps);
1826     cache_prefetch ((__m128i*)pd);
1827     cache_prefetch ((__m128i*)pm);
1828
1829     while (w >= 4)
1830     {
1831         /* fill cache line with next memory */
1832         cache_prefetch_next ((__m128i*)ps);
1833         cache_prefetch_next ((__m128i*)pd);
1834         cache_prefetch_next ((__m128i*)pm);
1835
1836         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1837         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1838         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1839
1840         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1841         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1842         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1843
1844         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1845                             &xmm_alpha_lo, &xmm_alpha_hi);
1846         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1847                             &xmm_mask_lo, &xmm_mask_hi,
1848                             &xmm_mask_lo, &xmm_mask_hi);
1849
1850         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1851                     &xmm_alpha_lo, &xmm_alpha_hi,
1852                     &xmm_mask_lo, &xmm_mask_hi);
1853
1854         save_128_aligned (
1855             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1856
1857         ps += 4;
1858         pd += 4;
1859         pm += 4;
1860         w -= 4;
1861     }
1862
1863     while (w)
1864     {
1865         s = *ps++;
1866         m = *pm++;
1867         d = *pd;
1868
1869         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1870         w--;
1871     }
1872 }
1873
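/*
 * IN with a component-alpha mask, per channel:
 *
 *   dest = (src * mask) * dest.alpha
 */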
1874 static force_inline void
1875 core_combine_in_ca_sse2 (uint32_t *      pd,
1876                          const uint32_t *ps,
1877                          const uint32_t *pm,
1878                          int             w)
1879 {
1880     uint32_t s, m, d;
1881
1882     __m128i xmm_alpha_lo, xmm_alpha_hi;
1883     __m128i xmm_src_lo, xmm_src_hi;
1884     __m128i xmm_dst_lo, xmm_dst_hi;
1885     __m128i xmm_mask_lo, xmm_mask_hi;
1886
1887     /* call prefetch hint to optimize cache load*/
1888     cache_prefetch ((__m128i*)ps);
1889     cache_prefetch ((__m128i*)pd);
1890     cache_prefetch ((__m128i*)pm);
1891
1892     while (w && (unsigned long)pd & 15)
1893     {
1894         s = *ps++;
1895         m = *pm++;
1896         d = *pd;
1897
1898         *pd++ = pack_1x64_32 (
1899             pix_multiply_1x64 (
1900                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1901                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1902
1903         w--;
1904     }
1905
1906     /* call prefetch hint to optimize cache load*/
1907     cache_prefetch ((__m128i*)ps);
1908     cache_prefetch ((__m128i*)pd);
1909     cache_prefetch ((__m128i*)pm);
1910
1911     while (w >= 4)
1912     {
1913         /* fill cache line with next memory */
1914         cache_prefetch_next ((__m128i*)ps);
1915         cache_prefetch_next ((__m128i*)pd);
1916         cache_prefetch_next ((__m128i*)pm);
1917
1918         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1919         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1920         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1921
1922         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1923         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1924         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1925
1926         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1927                             &xmm_alpha_lo, &xmm_alpha_hi);
1928
1929         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1930                             &xmm_mask_lo, &xmm_mask_hi,
1931                             &xmm_dst_lo, &xmm_dst_hi);
1932
1933         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1934                             &xmm_alpha_lo, &xmm_alpha_hi,
1935                             &xmm_dst_lo, &xmm_dst_hi);
1936
1937         save_128_aligned (
1938             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1939
1940         ps += 4;
1941         pd += 4;
1942         pm += 4;
1943         w -= 4;
1944     }
1945
1946     while (w)
1947     {
1948         s = *ps++;
1949         m = *pm++;
1950         d = *pd;
1951
1952         *pd++ = pack_1x64_32 (
1953             pix_multiply_1x64 (
1954                 pix_multiply_1x64 (
1955                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1956                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1957
1958         w--;
1959     }
1960 }
1961
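/*
 * IN_REVERSE with a component-alpha mask, per channel:
 *
 *   dest = dest * (mask * src.alpha)
 */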
1962 static force_inline void
1963 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1964                                  const uint32_t *ps,
1965                                  const uint32_t *pm,
1966                                  int             w)
1967 {
1968     uint32_t s, m, d;
1969
1970     __m128i xmm_alpha_lo, xmm_alpha_hi;
1971     __m128i xmm_src_lo, xmm_src_hi;
1972     __m128i xmm_dst_lo, xmm_dst_hi;
1973     __m128i xmm_mask_lo, xmm_mask_hi;
1974
1975     /* call prefetch hint to optimize cache load*/
1976     cache_prefetch ((__m128i*)ps);
1977     cache_prefetch ((__m128i*)pd);
1978     cache_prefetch ((__m128i*)pm);
1979
1980     while (w && (unsigned long)pd & 15)
1981     {
1982         s = *ps++;
1983         m = *pm++;
1984         d = *pd;
1985
1986         *pd++ = pack_1x64_32 (
1987             pix_multiply_1x64 (
1988                 unpack_32_1x64 (d),
1989                 pix_multiply_1x64 (unpack_32_1x64 (m),
1990                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1991         w--;
1992     }
1993
1994     /* call prefetch hint to optimize cache load*/
1995     cache_prefetch ((__m128i*)ps);
1996     cache_prefetch ((__m128i*)pd);
1997     cache_prefetch ((__m128i*)pm);
1998
1999     while (w >= 4)
2000     {
2001         /* fill cache line with next memory */
2002         cache_prefetch_next ((__m128i*)ps);
2003         cache_prefetch_next ((__m128i*)pd);
2004         cache_prefetch_next ((__m128i*)pm);
2005
2006         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2007         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2008         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2009
2010         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2011         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2012         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2013
2014         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2015                             &xmm_alpha_lo, &xmm_alpha_hi);
2016         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2017                             &xmm_alpha_lo, &xmm_alpha_hi,
2018                             &xmm_alpha_lo, &xmm_alpha_hi);
2019
2020         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2021                             &xmm_alpha_lo, &xmm_alpha_hi,
2022                             &xmm_dst_lo, &xmm_dst_hi);
2023
2024         save_128_aligned (
2025             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2026
2027         ps += 4;
2028         pd += 4;
2029         pm += 4;
2030         w -= 4;
2031     }
2032
2033     while (w)
2034     {
2035         s = *ps++;
2036         m = *pm++;
2037         d = *pd;
2038
2039         *pd++ = pack_1x64_32 (
2040             pix_multiply_1x64 (
2041                 unpack_32_1x64 (d),
2042                 pix_multiply_1x64 (unpack_32_1x64 (m),
2043                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
2044         w--;
2045     }
2046 }
2047
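/*
 * OUT with a component-alpha mask, per channel:
 *
 *   dest = (src * mask) * (1 - dest.alpha)
 */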
2048 static force_inline void
2049 core_combine_out_ca_sse2 (uint32_t *      pd,
2050                           const uint32_t *ps,
2051                           const uint32_t *pm,
2052                           int             w)
2053 {
2054     uint32_t s, m, d;
2055
2056     __m128i xmm_alpha_lo, xmm_alpha_hi;
2057     __m128i xmm_src_lo, xmm_src_hi;
2058     __m128i xmm_dst_lo, xmm_dst_hi;
2059     __m128i xmm_mask_lo, xmm_mask_hi;
2060
2061     /* call prefetch hint to optimize cache load*/
2062     cache_prefetch ((__m128i*)ps);
2063     cache_prefetch ((__m128i*)pd);
2064     cache_prefetch ((__m128i*)pm);
2065
2066     while (w && (unsigned long)pd & 15)
2067     {
2068         s = *ps++;
2069         m = *pm++;
2070         d = *pd;
2071
2072         *pd++ = pack_1x64_32 (
2073             pix_multiply_1x64 (
2074                 pix_multiply_1x64 (
2075                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2076                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2077         w--;
2078     }
2079
2080     /* call prefetch hint to optimize cache load*/
2081     cache_prefetch ((__m128i*)ps);
2082     cache_prefetch ((__m128i*)pd);
2083     cache_prefetch ((__m128i*)pm);
2084
2085     while (w >= 4)
2086     {
2087         /* fill cache line with next memory */
2088         cache_prefetch_next ((__m128i*)ps);
2089         cache_prefetch_next ((__m128i*)pd);
2090         cache_prefetch_next ((__m128i*)pm);
2091
2092         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2093         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2094         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2095
2096         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2097         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2098         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2099
2100         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2101                             &xmm_alpha_lo, &xmm_alpha_hi);
2102         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2103                       &xmm_alpha_lo, &xmm_alpha_hi);
2104
2105         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2106                             &xmm_mask_lo, &xmm_mask_hi,
2107                             &xmm_dst_lo, &xmm_dst_hi);
2108         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2109                             &xmm_alpha_lo, &xmm_alpha_hi,
2110                             &xmm_dst_lo, &xmm_dst_hi);
2111
2112         save_128_aligned (
2113             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2114
2115         ps += 4;
2116         pd += 4;
2117         pm += 4;
2118         w -= 4;
2119     }
2120
2121     while (w)
2122     {
2123         s = *ps++;
2124         m = *pm++;
2125         d = *pd;
2126
2127         *pd++ = pack_1x64_32 (
2128             pix_multiply_1x64 (
2129                 pix_multiply_1x64 (
2130                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2131                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2132
2133         w--;
2134     }
2135 }
2136
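/*
 * OUT_REVERSE with a component-alpha mask, per channel:
 *
 *   dest = dest * (1 - mask * src.alpha)
 */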
2137 static force_inline void
2138 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
2139                                   const uint32_t *ps,
2140                                   const uint32_t *pm,
2141                                   int             w)
2142 {
2143     uint32_t s, m, d;
2144
2145     __m128i xmm_alpha_lo, xmm_alpha_hi;
2146     __m128i xmm_src_lo, xmm_src_hi;
2147     __m128i xmm_dst_lo, xmm_dst_hi;
2148     __m128i xmm_mask_lo, xmm_mask_hi;
2149
2150     /* call prefetch hint to optimize cache load*/
2151     cache_prefetch ((__m128i*)ps);
2152     cache_prefetch ((__m128i*)pd);
2153     cache_prefetch ((__m128i*)pm);
2154
2155     while (w && (unsigned long)pd & 15)
2156     {
2157         s = *ps++;
2158         m = *pm++;
2159         d = *pd;
2160
2161         *pd++ = pack_1x64_32 (
2162             pix_multiply_1x64 (
2163                 unpack_32_1x64 (d),
2164                 negate_1x64 (pix_multiply_1x64 (
2165                                  unpack_32_1x64 (m),
2166                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2167         w--;
2168     }
2169
2170     /* call prefetch hint to optimize cache load*/
2171     cache_prefetch ((__m128i*)ps);
2172     cache_prefetch ((__m128i*)pd);
2173     cache_prefetch ((__m128i*)pm);
2174
2175     while (w >= 4)
2176     {
2177         /* fill cache line with next memory */
2178         cache_prefetch_next ((__m128i*)ps);
2179         cache_prefetch_next ((__m128i*)pd);
2180         cache_prefetch_next ((__m128i*)pm);
2181
2182         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2183         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2184         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2185
2186         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2187         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2188         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2189
2190         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2191                             &xmm_alpha_lo, &xmm_alpha_hi);
2192
2193         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2194                             &xmm_alpha_lo, &xmm_alpha_hi,
2195                             &xmm_mask_lo, &xmm_mask_hi);
2196
2197         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2198                       &xmm_mask_lo, &xmm_mask_hi);
2199
2200         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2201                             &xmm_mask_lo, &xmm_mask_hi,
2202                             &xmm_dst_lo, &xmm_dst_hi);
2203
2204         save_128_aligned (
2205             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2206
2207         ps += 4;
2208         pd += 4;
2209         pm += 4;
2210         w -= 4;
2211     }
2212
2213     while (w)
2214     {
2215         s = *ps++;
2216         m = *pm++;
2217         d = *pd;
2218
2219         *pd++ = pack_1x64_32 (
2220             pix_multiply_1x64 (
2221                 unpack_32_1x64 (d),
2222                 negate_1x64 (pix_multiply_1x64 (
2223                                  unpack_32_1x64 (m),
2224                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2225         w--;
2226     }
2227 }
2228
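/*
 * ATOP with a component-alpha mask, per channel:
 *
 *   dest = (src * mask) * dest.alpha + dest * (1 - mask * src.alpha)
 */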
2229 static force_inline uint32_t
2230 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2231                                  uint32_t mask,
2232                                  uint32_t dst)
2233 {
2234     __m64 m = unpack_32_1x64 (mask);
2235     __m64 s = unpack_32_1x64 (src);
2236     __m64 d = unpack_32_1x64 (dst);
2237     __m64 sa = expand_alpha_1x64 (s);
2238     __m64 da = expand_alpha_1x64 (d);
2239
2240     s = pix_multiply_1x64 (s, m);
2241     m = negate_1x64 (pix_multiply_1x64 (m, sa));
2242
2243     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2244 }
2245
2246 static force_inline void
2247 core_combine_atop_ca_sse2 (uint32_t *      pd,
2248                            const uint32_t *ps,
2249                            const uint32_t *pm,
2250                            int             w)
2251 {
2252     uint32_t s, m, d;
2253
2254     __m128i xmm_src_lo, xmm_src_hi;
2255     __m128i xmm_dst_lo, xmm_dst_hi;
2256     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2257     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2258     __m128i xmm_mask_lo, xmm_mask_hi;
2259
2260     /* call prefetch hint to optimize cache load*/
2261     cache_prefetch ((__m128i*)ps);
2262     cache_prefetch ((__m128i*)pd);
2263     cache_prefetch ((__m128i*)pm);
2264
2265     while (w && (unsigned long)pd & 15)
2266     {
2267         s = *ps++;
2268         m = *pm++;
2269         d = *pd;
2270
2271         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2272         w--;
2273     }
2274
2275     /* call prefetch hint to optimize cache load*/
2276     cache_prefetch ((__m128i*)ps);
2277     cache_prefetch ((__m128i*)pd);
2278     cache_prefetch ((__m128i*)pm);
2279
2280     while (w >= 4)
2281     {
2282         /* fill cache line with next memory */
2283         cache_prefetch_next ((__m128i*)ps);
2284         cache_prefetch_next ((__m128i*)pd);
2285         cache_prefetch_next ((__m128i*)pm);
2286
2287         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2288         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2289         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2290
2291         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2292         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2293         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2294
2295         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2296                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2297         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2298                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2299
2300         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2301                             &xmm_mask_lo, &xmm_mask_hi,
2302                             &xmm_src_lo, &xmm_src_hi);
2303         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2304                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2305                             &xmm_mask_lo, &xmm_mask_hi);
2306
2307         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2308
2309         pix_add_multiply_2x128 (
2310             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2311             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2312             &xmm_dst_lo, &xmm_dst_hi);
2313
2314         save_128_aligned (
2315             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2316
2317         ps += 4;
2318         pd += 4;
2319         pm += 4;
2320         w -= 4;
2321     }
2322
2323     while (w)
2324     {
2325         s = *ps++;
2326         m = *pm++;
2327         d = *pd;
2328
2329         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2330         w--;
2331     }
2332 }
2333
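/*
 * ATOP_REVERSE with a component-alpha mask, per channel:
 *
 *   dest = dest * (mask * src.alpha) + (src * mask) * (1 - dest.alpha)
 */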
2334 static force_inline uint32_t
2335 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2336                                          uint32_t mask,
2337                                          uint32_t dst)
2338 {
2339     __m64 m = unpack_32_1x64 (mask);
2340     __m64 s = unpack_32_1x64 (src);
2341     __m64 d = unpack_32_1x64 (dst);
2342
2343     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2344     __m64 sa = expand_alpha_1x64 (s);
2345
2346     s = pix_multiply_1x64 (s, m);
2347     m = pix_multiply_1x64 (m, sa);
2348
2349     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2350 }
2351
2352 static force_inline void
2353 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2354                                    const uint32_t *ps,
2355                                    const uint32_t *pm,
2356                                    int             w)
2357 {
2358     uint32_t s, m, d;
2359
2360     __m128i xmm_src_lo, xmm_src_hi;
2361     __m128i xmm_dst_lo, xmm_dst_hi;
2362     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2363     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2364     __m128i xmm_mask_lo, xmm_mask_hi;
2365
2366     /* call prefetch hint to optimize cache load*/
2367     cache_prefetch ((__m128i*)ps);
2368     cache_prefetch ((__m128i*)pd);
2369     cache_prefetch ((__m128i*)pm);
2370
2371     while (w && (unsigned long)pd & 15)
2372     {
2373         s = *ps++;
2374         m = *pm++;
2375         d = *pd;
2376
2377         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2378         w--;
2379     }
2380
2381     /* call prefetch hint to optimize cache load*/
2382     cache_prefetch ((__m128i*)ps);
2383     cache_prefetch ((__m128i*)pd);
2384     cache_prefetch ((__m128i*)pm);
2385
2386     while (w >= 4)
2387     {
2388         /* fill cache line with next memory */
2389         cache_prefetch_next ((__m128i*)ps);
2390         cache_prefetch_next ((__m128i*)pd);
2391         cache_prefetch_next ((__m128i*)pm);
2392
2393         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2394         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2395         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2396
2397         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2398         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2399         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2400
2401         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2402                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2403         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2404                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2405
2406         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2407                             &xmm_mask_lo, &xmm_mask_hi,
2408                             &xmm_src_lo, &xmm_src_hi);
2409         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2410                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2411                             &xmm_mask_lo, &xmm_mask_hi);
2412
2413         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2414                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2415
2416         pix_add_multiply_2x128 (
2417             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2418             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2419             &xmm_dst_lo, &xmm_dst_hi);
2420
2421         save_128_aligned (
2422             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2423
2424         ps += 4;
2425         pd += 4;
2426         pm += 4;
2427         w -= 4;
2428     }
2429
2430     while (w)
2431     {
2432         s = *ps++;
2433         m = *pm++;
2434         d = *pd;
2435
2436         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2437         w--;
2438     }
2439 }
2440
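/*
 * XOR with a component-alpha mask, per channel:
 *
 *   dest = dest * (1 - mask * src.alpha) + (src * mask) * (1 - dest.alpha)
 */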
2441 static force_inline uint32_t
2442 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2443                                 uint32_t mask,
2444                                 uint32_t dst)
2445 {
2446     __m64 a = unpack_32_1x64 (mask);
2447     __m64 s = unpack_32_1x64 (src);
2448     __m64 d = unpack_32_1x64 (dst);
2449
2450     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2451                                        a, expand_alpha_1x64 (s)));
2452     __m64 dest      = pix_multiply_1x64 (s, a);
2453     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2454
2455     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2456                                                 &alpha_dst,
2457                                                 &dest,
2458                                                 &alpha_src));
2459 }
2460
2461 static force_inline void
2462 core_combine_xor_ca_sse2 (uint32_t *      pd,
2463                           const uint32_t *ps,
2464                           const uint32_t *pm,
2465                           int             w)
2466 {
2467     uint32_t s, m, d;
2468
2469     __m128i xmm_src_lo, xmm_src_hi;
2470     __m128i xmm_dst_lo, xmm_dst_hi;
2471     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2472     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2473     __m128i xmm_mask_lo, xmm_mask_hi;
2474
2475     /* call prefetch hint to optimize cache load*/
2476     cache_prefetch ((__m128i*)ps);
2477     cache_prefetch ((__m128i*)pd);
2478     cache_prefetch ((__m128i*)pm);
2479
2480     while (w && (unsigned long)pd & 15)
2481     {
2482         s = *ps++;
2483         m = *pm++;
2484         d = *pd;
2485
2486         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2487         w--;
2488     }
2489
2490     /* call prefetch hint to optimize cache load*/
2491     cache_prefetch ((__m128i*)ps);
2492     cache_prefetch ((__m128i*)pd);
2493     cache_prefetch ((__m128i*)pm);
2494
2495     while (w >= 4)
2496     {
2497         /* fill cache line with next memory */
2498         cache_prefetch_next ((__m128i*)ps);
2499         cache_prefetch_next ((__m128i*)pd);
2500         cache_prefetch_next ((__m128i*)pm);
2501
2502         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2503         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2504         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2505
2506         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2507         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2508         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2509
2510         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2511                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2512         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2513                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2514
2515         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2516                             &xmm_mask_lo, &xmm_mask_hi,
2517                             &xmm_src_lo, &xmm_src_hi);
2518         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2519                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2520                             &xmm_mask_lo, &xmm_mask_hi);
2521
2522         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2523                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2524         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2525                       &xmm_mask_lo, &xmm_mask_hi);
2526
2527         pix_add_multiply_2x128 (
2528             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2529             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2530             &xmm_dst_lo, &xmm_dst_hi);
2531
2532         save_128_aligned (
2533             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2534
2535         ps += 4;
2536         pd += 4;
2537         pm += 4;
2538         w -= 4;
2539     }
2540
2541     while (w)
2542     {
2543         s = *ps++;
2544         m = *pm++;
2545         d = *pd;
2546
2547         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2548         w--;
2549     }
2550 }
2551
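/*
 * ADD with a component-alpha mask: a saturating, per-byte unsigned add of
 * the masked source onto the destination:
 *
 *   dest = clamp (dest + src * mask)
 */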
2552 static force_inline void
2553 core_combine_add_ca_sse2 (uint32_t *      pd,
2554                           const uint32_t *ps,
2555                           const uint32_t *pm,
2556                           int             w)
2557 {
2558     uint32_t s, m, d;
2559
2560     __m128i xmm_src_lo, xmm_src_hi;
2561     __m128i xmm_dst_lo, xmm_dst_hi;
2562     __m128i xmm_mask_lo, xmm_mask_hi;
2563
2564     /* call prefetch hint to optimize cache load*/
2565     cache_prefetch ((__m128i*)ps);
2566     cache_prefetch ((__m128i*)pd);
2567     cache_prefetch ((__m128i*)pm);
2568
2569     while (w && (unsigned long)pd & 15)
2570     {
2571         s = *ps++;
2572         m = *pm++;
2573         d = *pd;
2574
2575         *pd++ = pack_1x64_32 (
2576             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2577                                              unpack_32_1x64 (m)),
2578                           unpack_32_1x64 (d)));
2579         w--;
2580     }
2581
2582     /* call prefetch hint to optimize cache load*/
2583     cache_prefetch ((__m128i*)ps);
2584     cache_prefetch ((__m128i*)pd);
2585     cache_prefetch ((__m128i*)pm);
2586
2587     while (w >= 4)
2588     {
2589         /* fill cache line with next memory */
2590         cache_prefetch_next ((__m128i*)ps);
2591         cache_prefetch_next ((__m128i*)pd);
2592         cache_prefetch_next ((__m128i*)pm);
2593
2594         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2595         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2596         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2597
2598         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2599         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2600         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2601
2602         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2603                             &xmm_mask_lo, &xmm_mask_hi,
2604                             &xmm_src_lo, &xmm_src_hi);
2605
2606         save_128_aligned (
2607             (__m128i*)pd, pack_2x128_128 (
2608                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2609                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2610
2611         ps += 4;
2612         pd += 4;
2613         pm += 4;
2614         w -= 4;
2615     }
2616
2617     while (w)
2618     {
2619         s = *ps++;
2620         m = *pm++;
2621         d = *pd;
2622
2623         *pd++ = pack_1x64_32 (
2624             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2625                                              unpack_32_1x64 (m)),
2626                           unpack_32_1x64 (d)));
2627         w--;
2628     }
2629 }
2630
2631 /* ---------------------------------------------------
2632  * fb_compose_setup_SSE2
2633  */
2634 static force_inline __m64
2635 create_mask_16_64 (uint16_t mask)
2636 {
2637     return _mm_set1_pi16 (mask);
2638 }
2639
2640 static force_inline __m128i
2641 create_mask_16_128 (uint16_t mask)
2642 {
2643     return _mm_set1_epi16 (mask);
2644 }
2645
2646 static force_inline __m64
2647 create_mask_2x32_64 (uint32_t mask0,
2648                      uint32_t mask1)
2649 {
2650     return _mm_set_pi32 (mask0, mask1);
2651 }
2652
2653 /* Work around a code generation bug in Sun Studio 12. */
2654 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2655 # define create_mask_2x32_128(mask0, mask1)                             \
2656     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2657 #else
2658 static force_inline __m128i
2659 create_mask_2x32_128 (uint32_t mask0,
2660                       uint32_t mask1)
2661 {
2662     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2663 }
2664 #endif
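/*
 * The create_mask_* helpers above build constant SIMD registers used by the
 * fast paths.  An illustrative (hypothetical) use, replicating the 0x0080
 * rounding constant into every 16-bit lane of an MMX and an SSE2 register:
 *
 *   __m64   half_64  = create_mask_16_64  (0x0080);
 *   __m128i half_128 = create_mask_16_128 (0x0080);
 */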
2665
2666 /* SSE2 code patch for fbcompose.c */
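/*
 * The sse2_combine_* functions below are thin wrappers that adapt the
 * core_combine_*_sse2 helpers above to the generic combiner signature
 * (imp, op, dst, src, mask, width).  Each wrapper ends with _mm_empty ()
 * to leave the FPU/MMX state clean, since the per-pixel helpers use __m64.
 */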
2667
2668 static void
2669 sse2_combine_over_u (pixman_implementation_t *imp,
2670                      pixman_op_t              op,
2671                      uint32_t *               dst,
2672                      const uint32_t *         src,
2673                      const uint32_t *         mask,
2674                      int                      width)
2675 {
2676     core_combine_over_u_sse2 (dst, src, mask, width);
2677     _mm_empty ();
2678 }
2679
2680 static void
2681 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2682                              pixman_op_t              op,
2683                              uint32_t *               dst,
2684                              const uint32_t *         src,
2685                              const uint32_t *         mask,
2686                              int                      width)
2687 {
2688     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2689     _mm_empty ();
2690 }
2691
2692 static void
2693 sse2_combine_in_u (pixman_implementation_t *imp,
2694                    pixman_op_t              op,
2695                    uint32_t *               dst,
2696                    const uint32_t *         src,
2697                    const uint32_t *         mask,
2698                    int                      width)
2699 {
2700     core_combine_in_u_sse2 (dst, src, mask, width);
2701     _mm_empty ();
2702 }
2703
2704 static void
2705 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2706                            pixman_op_t              op,
2707                            uint32_t *               dst,
2708                            const uint32_t *         src,
2709                            const uint32_t *         mask,
2710                            int                      width)
2711 {
2712     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2713     _mm_empty ();
2714 }
2715
2716 static void
2717 sse2_combine_out_u (pixman_implementation_t *imp,
2718                     pixman_op_t              op,
2719                     uint32_t *               dst,
2720                     const uint32_t *         src,
2721                     const uint32_t *         mask,
2722                     int                      width)
2723 {
2724     core_combine_out_u_sse2 (dst, src, mask, width);
2725     _mm_empty ();
2726 }
2727
2728 static void
2729 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2730                             pixman_op_t              op,
2731                             uint32_t *               dst,
2732                             const uint32_t *         src,
2733                             const uint32_t *         mask,
2734                             int                      width)
2735 {
2736     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2737     _mm_empty ();
2738 }
2739
2740 static void
2741 sse2_combine_atop_u (pixman_implementation_t *imp,
2742                      pixman_op_t              op,
2743                      uint32_t *               dst,
2744                      const uint32_t *         src,
2745                      const uint32_t *         mask,
2746                      int                      width)
2747 {
2748     core_combine_atop_u_sse2 (dst, src, mask, width);
2749     _mm_empty ();
2750 }
2751
2752 static void
2753 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2754                              pixman_op_t              op,
2755                              uint32_t *               dst,
2756                              const uint32_t *         src,
2757                              const uint32_t *         mask,
2758                              int                      width)
2759 {
2760     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2761     _mm_empty ();
2762 }
2763
2764 static void
2765 sse2_combine_xor_u (pixman_implementation_t *imp,
2766                     pixman_op_t              op,
2767                     uint32_t *               dst,
2768                     const uint32_t *         src,
2769                     const uint32_t *         mask,
2770                     int                      width)
2771 {
2772     core_combine_xor_u_sse2 (dst, src, mask, width);
2773     _mm_empty ();
2774 }
2775
2776 static void
2777 sse2_combine_add_u (pixman_implementation_t *imp,
2778                     pixman_op_t              op,
2779                     uint32_t *               dst,
2780                     const uint32_t *         src,
2781                     const uint32_t *         mask,
2782                     int                      width)
2783 {
2784     core_combine_add_u_sse2 (dst, src, mask, width);
2785     _mm_empty ();
2786 }
2787
2788 static void
2789 sse2_combine_saturate_u (pixman_implementation_t *imp,
2790                          pixman_op_t              op,
2791                          uint32_t *               dst,
2792                          const uint32_t *         src,
2793                          const uint32_t *         mask,
2794                          int                      width)
2795 {
2796     core_combine_saturate_u_sse2 (dst, src, mask, width);
2797     _mm_empty ();
2798 }
2799
2800 static void
2801 sse2_combine_src_ca (pixman_implementation_t *imp,
2802                      pixman_op_t              op,
2803                      uint32_t *               dst,
2804                      const uint32_t *         src,
2805                      const uint32_t *         mask,
2806                      int                      width)
2807 {
2808     core_combine_src_ca_sse2 (dst, src, mask, width);
2809     _mm_empty ();
2810 }
2811
2812 static void
2813 sse2_combine_over_ca (pixman_implementation_t *imp,
2814                       pixman_op_t              op,
2815                       uint32_t *               dst,
2816                       const uint32_t *         src,
2817                       const uint32_t *         mask,
2818                       int                      width)
2819 {
2820     core_combine_over_ca_sse2 (dst, src, mask, width);
2821     _mm_empty ();
2822 }
2823
2824 static void
2825 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2826                               pixman_op_t              op,
2827                               uint32_t *               dst,
2828                               const uint32_t *         src,
2829                               const uint32_t *         mask,
2830                               int                      width)
2831 {
2832     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2833     _mm_empty ();
2834 }
2835
2836 static void
2837 sse2_combine_in_ca (pixman_implementation_t *imp,
2838                     pixman_op_t              op,
2839                     uint32_t *               dst,
2840                     const uint32_t *         src,
2841                     const uint32_t *         mask,
2842                     int                      width)
2843 {
2844     core_combine_in_ca_sse2 (dst, src, mask, width);
2845     _mm_empty ();
2846 }
2847
2848 static void
2849 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2850                             pixman_op_t              op,
2851                             uint32_t *               dst,
2852                             const uint32_t *         src,
2853                             const uint32_t *         mask,
2854                             int                      width)
2855 {
2856     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2857     _mm_empty ();
2858 }
2859
2860 static void
2861 sse2_combine_out_ca (pixman_implementation_t *imp,
2862                      pixman_op_t              op,
2863                      uint32_t *               dst,
2864                      const uint32_t *         src,
2865                      const uint32_t *         mask,
2866                      int                      width)
2867 {
2868     core_combine_out_ca_sse2 (dst, src, mask, width);
2869     _mm_empty ();
2870 }
2871
2872 static void
2873 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2874                              pixman_op_t              op,
2875                              uint32_t *               dst,
2876                              const uint32_t *         src,
2877                              const uint32_t *         mask,
2878                              int                      width)
2879 {
2880     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2881     _mm_empty ();
2882 }
2883
2884 static void
2885 sse2_combine_atop_ca (pixman_implementation_t *imp,
2886                       pixman_op_t              op,
2887                       uint32_t *               dst,
2888                       const uint32_t *         src,
2889                       const uint32_t *         mask,
2890                       int                      width)
2891 {
2892     core_combine_atop_ca_sse2 (dst, src, mask, width);
2893     _mm_empty ();
2894 }
2895
2896 static void
2897 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2898                               pixman_op_t              op,
2899                               uint32_t *               dst,
2900                               const uint32_t *         src,
2901                               const uint32_t *         mask,
2902                               int                      width)
2903 {
2904     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2905     _mm_empty ();
2906 }
2907
2908 static void
2909 sse2_combine_xor_ca (pixman_implementation_t *imp,
2910                      pixman_op_t              op,
2911                      uint32_t *               dst,
2912                      const uint32_t *         src,
2913                      const uint32_t *         mask,
2914                      int                      width)
2915 {
2916     core_combine_xor_ca_sse2 (dst, src, mask, width);
2917     _mm_empty ();
2918 }
2919
2920 static void
2921 sse2_combine_add_ca (pixman_implementation_t *imp,
2922                      pixman_op_t              op,
2923                      uint32_t *               dst,
2924                      const uint32_t *         src,
2925                      const uint32_t *         mask,
2926                      int                      width)
2927 {
2928     core_combine_add_ca_sse2 (dst, src, mask, width);
2929     _mm_empty ();
2930 }
2931
2932 /* -------------------------------------------------------------------
2933  * composite_over_n_8888
2934  */
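/*
 * OVER onto a 32 bpp (8888) destination with a solid ("n") source.  Like
 * the other SSE2 loops in this file, each scanline is handled in three
 * stages: a scalar loop until the destination pointer is 16-byte aligned,
 * an SSE2 loop processing four pixels per iteration with aligned stores,
 * and a scalar tail for the remaining pixels.
 */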
2935
2936 static void
2937 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2938                             pixman_op_t              op,
2939                             pixman_image_t *         src_image,
2940                             pixman_image_t *         mask_image,
2941                             pixman_image_t *         dst_image,
2942                             int32_t                  src_x,
2943                             int32_t                  src_y,
2944                             int32_t                  mask_x,
2945                             int32_t                  mask_y,
2946                             int32_t                  dest_x,
2947                             int32_t                  dest_y,
2948                             int32_t                  width,
2949                             int32_t                  height)
2950 {
2951     uint32_t src;
2952     uint32_t    *dst_line, *dst, d;
2953     int32_t w;
2954     int dst_stride;
2955     __m128i xmm_src, xmm_alpha;
2956     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2957
2958     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2959
2960     if (src == 0)
2961         return;
2962
2963     PIXMAN_IMAGE_GET_LINE (
2964         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2965
2966     xmm_src = expand_pixel_32_1x128 (src);
2967     xmm_alpha = expand_alpha_1x128 (xmm_src);
2968
2969     while (height--)
2970     {
2971         dst = dst_line;
2972
2973         /* call prefetch hint to optimize cache load*/
2974         cache_prefetch ((__m128i*)dst);
2975
2976         dst_line += dst_stride;
2977         w = width;
2978
2979         while (w && (unsigned long)dst & 15)
2980         {
2981             d = *dst;
2982             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2983                                               _mm_movepi64_pi64 (xmm_alpha),
2984                                               unpack_32_1x64 (d)));
2985             w--;
2986         }
2987
2988         cache_prefetch ((__m128i*)dst);
2989
2990         while (w >= 4)
2991         {
2992             /* fill cache line with next memory */
2993             cache_prefetch_next ((__m128i*)dst);
2994
2995             xmm_dst = load_128_aligned ((__m128i*)dst);
2996
2997             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2998
2999             over_2x128 (&xmm_src, &xmm_src,
3000                         &xmm_alpha, &xmm_alpha,
3001                         &xmm_dst_lo, &xmm_dst_hi);
3002
3003             /* rebuild the 4 pixel data and save */
3004             save_128_aligned (
3005                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3006
3007             w -= 4;
3008             dst += 4;
3009         }
3010
3011         while (w)
3012         {
3013             d = *dst;
3014             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3015                                               _mm_movepi64_pi64 (xmm_alpha),
3016                                               unpack_32_1x64 (d)));
3017             w--;
3018         }
3019
3020     }
3021     _mm_empty ();
3022 }
3023
3024 /* ---------------------------------------------------------------------
3025  * composite_over_n_0565
3026  */
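/*
 * OVER onto an r5g6b5 ("0565") destination with a solid source.  The 16 bpp
 * destination is processed eight pixels at a time: unpacked to 8888 with
 * unpack_565_128_4x128, blended with over_2x128, and repacked with
 * pack_565_4x128_128.
 */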
3027 static void
3028 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3029                             pixman_op_t              op,
3030                             pixman_image_t *         src_image,
3031                             pixman_image_t *         mask_image,
3032                             pixman_image_t *         dst_image,
3033                             int32_t                  src_x,
3034                             int32_t                  src_y,
3035                             int32_t                  mask_x,
3036                             int32_t                  mask_y,
3037                             int32_t                  dest_x,
3038                             int32_t                  dest_y,
3039                             int32_t                  width,
3040                             int32_t                  height)
3041 {
3042     uint32_t src;
3043     uint16_t    *dst_line, *dst, d;
3044     int32_t w;
3045     int dst_stride;
3046     __m128i xmm_src, xmm_alpha;
3047     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3048
3049     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3050
3051     if (src == 0)
3052         return;
3053
3054     PIXMAN_IMAGE_GET_LINE (
3055         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3056
3057     xmm_src = expand_pixel_32_1x128 (src);
3058     xmm_alpha = expand_alpha_1x128 (xmm_src);
3059
3060     while (height--)
3061     {
3062         dst = dst_line;
3063
3064         /* call prefetch hint to optimize cache load*/
3065         cache_prefetch ((__m128i*)dst);
3066
3067         dst_line += dst_stride;
3068         w = width;
3069
3070         while (w && (unsigned long)dst & 15)
3071         {
3072             d = *dst;
3073
3074             *dst++ = pack_565_32_16 (
3075                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3076                                          _mm_movepi64_pi64 (xmm_alpha),
3077                                          expand565_16_1x64 (d))));
3078             w--;
3079         }
3080
3081         /* call prefetch hint to optimize cache load*/
3082         cache_prefetch ((__m128i*)dst);
3083
3084         while (w >= 8)
3085         {
3086             /* fill cache line with next memory */
3087             cache_prefetch_next ((__m128i*)dst);
3088
3089             xmm_dst = load_128_aligned ((__m128i*)dst);
3090
3091             unpack_565_128_4x128 (xmm_dst,
3092                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3093
3094             over_2x128 (&xmm_src, &xmm_src,
3095                         &xmm_alpha, &xmm_alpha,
3096                         &xmm_dst0, &xmm_dst1);
3097             over_2x128 (&xmm_src, &xmm_src,
3098                         &xmm_alpha, &xmm_alpha,
3099                         &xmm_dst2, &xmm_dst3);
3100
3101             xmm_dst = pack_565_4x128_128 (
3102                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3103
3104             save_128_aligned ((__m128i*)dst, xmm_dst);
3105
3106             dst += 8;
3107             w -= 8;
3108         }
3109
3110         while (w--)
3111         {
3112             d = *dst;
3113             *dst++ = pack_565_32_16 (
3114                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3115                                          _mm_movepi64_pi64 (xmm_alpha),
3116                                          expand565_16_1x64 (d))));
3117         }
3118     }
3119
3120     _mm_empty ();
3121 }
3122
3123 /* ------------------------------
3124  * composite_add_n_8888_8888_ca
3125  */
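/*
 * ADD of a solid source through an 8888 component-alpha mask onto an 8888
 * destination.  In the four-pixel loop the mask is tested with
 * _mm_cmpeq_epi32/_mm_movemask_epi8 so that fully transparent mask spans
 * skip the destination read-modify-write entirely.
 */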
3126 static void
3127 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3128                                    pixman_op_t              op,
3129                                    pixman_image_t *         src_image,
3130                                    pixman_image_t *         mask_image,
3131                                    pixman_image_t *         dst_image,
3132                                    int32_t                  src_x,
3133                                    int32_t                  src_y,
3134                                    int32_t                  mask_x,
3135                                    int32_t                  mask_y,
3136                                    int32_t                  dest_x,
3137                                    int32_t                  dest_y,
3138                                    int32_t                  width,
3139                                    int32_t                  height)
3140 {
3141     uint32_t src, srca;
3142     uint32_t    *dst_line, d;
3143     uint32_t    *mask_line, m;
3144     uint32_t pack_cmp;
3145     int dst_stride, mask_stride;
3146
3147     __m128i xmm_src, xmm_alpha;
3148     __m128i xmm_dst;
3149     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3150
3151     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3152
3153     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3154     srca = src >> 24;
3155
3156     if (src == 0)
3157         return;
3158
3159     PIXMAN_IMAGE_GET_LINE (
3160         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3161     PIXMAN_IMAGE_GET_LINE (
3162         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3163
3164     xmm_src = _mm_unpacklo_epi8 (
3165         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3166     xmm_alpha = expand_alpha_1x128 (xmm_src);
3167     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3168     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3169
3170     while (height--)
3171     {
3172         int w = width;
3173         const uint32_t *pm = (uint32_t *)mask_line;
3174         uint32_t *pd = (uint32_t *)dst_line;
3175
3176         dst_line += dst_stride;
3177         mask_line += mask_stride;
3178
3179         /* call prefetch hint to optimize cache load*/
3180         cache_prefetch ((__m128i*)pd);
3181         cache_prefetch ((__m128i*)pm);
3182
3183         while (w && (unsigned long)pd & 15)
3184         {
3185             m = *pm++;
3186
3187             if (m)
3188             {
3189                 d = *pd;
3190
3191                 mmx_mask = unpack_32_1x64 (m);
3192                 mmx_dest = unpack_32_1x64 (d);
3193
3194                 *pd = pack_1x64_32 (
3195                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3196             }
3197
3198             pd++;
3199             w--;
3200         }
3201
3202         /* call prefetch hint to optimize cache load*/
3203         cache_prefetch ((__m128i*)pd);
3204         cache_prefetch ((__m128i*)pm);
3205
3206         while (w >= 4)
3207         {
3208             /* fill cache line with next memory */
3209             cache_prefetch_next ((__m128i*)pd);
3210             cache_prefetch_next ((__m128i*)pm);
3211
3212             xmm_mask = load_128_unaligned ((__m128i*)pm);
3213
3214             pack_cmp =
3215                 _mm_movemask_epi8 (
3216                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3217
3218             /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
3219             if (pack_cmp != 0xffff)
3220             {
3221                 xmm_dst = load_128_aligned ((__m128i*)pd);
3222
3223                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3224
3225                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3226                                     &xmm_mask_lo, &xmm_mask_hi,
3227                                     &xmm_mask_lo, &xmm_mask_hi);
3228                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3229
3230                 save_128_aligned (
3231                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3232             }
3233
3234             pd += 4;
3235             pm += 4;
3236             w -= 4;
3237         }
3238
3239         while (w)
3240         {
3241             m = *pm++;
3242
3243             if (m)
3244             {
3245                 d = *pd;
3246
3247                 mmx_mask = unpack_32_1x64 (m);
3248                 mmx_dest = unpack_32_1x64 (d);
3249
3250                 *pd = pack_1x64_32 (
3251                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3252             }
3253
3254             pd++;
3255             w--;
3256         }
3257     }
3258
3259     _mm_empty ();
3260 }
3261
3262 /* ---------------------------------------------------------------------------
3263  * composite_over_n_8888_8888_ca
3264  */
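/*
 * OVER of a solid source through an 8888 component-alpha mask onto an 8888
 * destination: dest = (src IN mask) OVER dest, computed per channel by the
 * in_over_1x64/in_over_2x128 helpers.  As in the ADD path above, all-zero
 * mask blocks are detected and skipped.
 */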
3265
3266 static void
3267 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3268                                     pixman_op_t              op,
3269                                     pixman_image_t *         src_image,
3270                                     pixman_image_t *         mask_image,
3271                                     pixman_image_t *         dst_image,
3272                                     int32_t                  src_x,
3273                                     int32_t                  src_y,
3274                                     int32_t                  mask_x,
3275                                     int32_t                  mask_y,
3276                                     int32_t                  dest_x,
3277                                     int32_t                  dest_y,
3278                                     int32_t                  width,
3279                                     int32_t                  height)
3280 {
3281     uint32_t src;
3282     uint32_t    *dst_line, d;
3283     uint32_t    *mask_line, m;
3284     uint32_t pack_cmp;
3285     int dst_stride, mask_stride;
3286
3287     __m128i xmm_src, xmm_alpha;
3288     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3289     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3290
3291     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3292
3293     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3294
3295     if (src == 0)
3296         return;
3297
3298     PIXMAN_IMAGE_GET_LINE (
3299         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3300     PIXMAN_IMAGE_GET_LINE (
3301         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3302
3303     xmm_src = _mm_unpacklo_epi8 (
3304         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3305     xmm_alpha = expand_alpha_1x128 (xmm_src);
3306     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3307     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3308
3309     while (height--)
3310     {
3311         int w = width;
3312         const uint32_t *pm = (uint32_t *)mask_line;
3313         uint32_t *pd = (uint32_t *)dst_line;
3314
3315         dst_line += dst_stride;
3316         mask_line += mask_stride;
3317
3318         /* call prefetch hint to optimize cache load*/
3319         cache_prefetch ((__m128i*)pd);
3320         cache_prefetch ((__m128i*)pm);
3321
3322         while (w && (unsigned long)pd & 15)
3323         {
3324             m = *pm++;
3325
3326             if (m)
3327             {
3328                 d = *pd;
3329                 mmx_mask = unpack_32_1x64 (m);
3330                 mmx_dest = unpack_32_1x64 (d);
3331
3332                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3333                                                   &mmx_alpha,
3334                                                   &mmx_mask,
3335                                                   &mmx_dest));
3336             }
3337
3338             pd++;
3339             w--;
3340         }
3341
3342         /* call prefetch hint to optimize cache load*/
3343         cache_prefetch ((__m128i*)pd);
3344         cache_prefetch ((__m128i*)pm);
3345
3346         while (w >= 4)
3347         {
3348             /* fill cache line with next memory */
3349             cache_prefetch_next ((__m128i*)pd);
3350             cache_prefetch_next ((__m128i*)pm);
3351
3352             xmm_mask = load_128_unaligned ((__m128i*)pm);
3353
3354             pack_cmp =
3355                 _mm_movemask_epi8 (
3356                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3357
3358             /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
3359             if (pack_cmp != 0xffff)
3360             {
3361                 xmm_dst = load_128_aligned ((__m128i*)pd);
3362
3363                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3364                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3365
3366                 in_over_2x128 (&xmm_src, &xmm_src,
3367                                &xmm_alpha, &xmm_alpha,
3368                                &xmm_mask_lo, &xmm_mask_hi,
3369                                &xmm_dst_lo, &xmm_dst_hi);
3370
3371                 save_128_aligned (
3372                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3373             }
3374
3375             pd += 4;
3376             pm += 4;
3377             w -= 4;
3378         }
3379
3380         while (w)
3381         {
3382             m = *pm++;
3383
3384             if (m)
3385             {
3386                 d = *pd;
3387                 mmx_mask = unpack_32_1x64 (m);
3388                 mmx_dest = unpack_32_1x64 (d);
3389
3390                 *pd = pack_1x64_32 (
3391                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3392             }
3393
3394             pd++;
3395             w--;
3396         }
3397     }
3398
3399     _mm_empty ();
3400 }
3401
3402 /*---------------------------------------------------------------------
3403  * composite_over_8888_n_8888
3404  */
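/*
 * OVER of an a8r8g8b8 source onto an 8888 destination, modulated by a solid
 * mask.  Only the alpha byte of the mask is used: mask >> 24 is replicated
 * into every 16-bit lane with create_mask_16_128 and passed to the in_over
 * helpers as the per-channel mask.
 */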
3405
3406 static void
3407 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3408                                  pixman_op_t              op,
3409                                  pixman_image_t *         src_image,
3410                                  pixman_image_t *         mask_image,
3411                                  pixman_image_t *         dst_image,
3412                                  int32_t                  src_x,
3413                                  int32_t                  src_y,
3414                                  int32_t                  mask_x,
3415                                  int32_t                  mask_y,
3416                                  int32_t                  dest_x,
3417                                  int32_t                  dest_y,
3418                                  int32_t                  width,
3419                                  int32_t                  height)
3420 {
3421     uint32_t    *dst_line, *dst;
3422     uint32_t    *src_line, *src;
3423     uint32_t mask;
3424     int32_t w;
3425     int dst_stride, src_stride;
3426
3427     __m128i xmm_mask;
3428     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3429     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3430     __m128i xmm_alpha_lo, xmm_alpha_hi;
3431
3432     PIXMAN_IMAGE_GET_LINE (
3433         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3434     PIXMAN_IMAGE_GET_LINE (
3435         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3436
3437     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3438
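    /* Only the alpha channel of the solid mask matters here;
     * create_mask_16_128 replicates it into every 16-bit lane so it can
     * scale all channels of the unpacked source pixels.
     */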
3439     xmm_mask = create_mask_16_128 (mask >> 24);
3440
3441     while (height--)
3442     {
3443         dst = dst_line;
3444         dst_line += dst_stride;
3445         src = src_line;
3446         src_line += src_stride;
3447         w = width;
3448
3449         /* call prefetch hint to optimize cache load*/
3450         cache_prefetch ((__m128i*)dst);
3451         cache_prefetch ((__m128i*)src);
3452
3453         while (w && (unsigned long)dst & 15)
3454         {
3455             uint32_t s = *src++;
3456             uint32_t d = *dst;
3457
3458             __m64 ms = unpack_32_1x64 (s);
3459             __m64 alpha    = expand_alpha_1x64 (ms);
3460             __m64 mask      = _mm_movepi64_pi64 (xmm_mask);
3461             __m64 dest      = unpack_32_1x64 (d);
3462
3463             *dst++ = pack_1x64_32 (
3464                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3465
3466             w--;
3467         }
3468
3469         /* call prefetch hint to optimize cache load*/
3470         cache_prefetch ((__m128i*)dst);
3471         cache_prefetch ((__m128i*)src);
3472
3473         while (w >= 4)
3474         {
3475             /* fill cache line with next memory */
3476             cache_prefetch_next ((__m128i*)dst);
3477             cache_prefetch_next ((__m128i*)src);
3478
3479             xmm_src = load_128_unaligned ((__m128i*)src);
3480             xmm_dst = load_128_aligned ((__m128i*)dst);
3481
3482             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3483             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3484             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3485                                 &xmm_alpha_lo, &xmm_alpha_hi);
3486
3487             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3488                            &xmm_alpha_lo, &xmm_alpha_hi,
3489                            &xmm_mask, &xmm_mask,
3490                            &xmm_dst_lo, &xmm_dst_hi);
3491
3492             save_128_aligned (
3493                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3494
3495             dst += 4;
3496             src += 4;
3497             w -= 4;
3498         }
3499
3500         while (w)
3501         {
3502             uint32_t s = *src++;
3503             uint32_t d = *dst;
3504
3505             __m64 ms = unpack_32_1x64 (s);
3506             __m64 alpha = expand_alpha_1x64 (ms);
3507             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3508             __m64 dest  = unpack_32_1x64 (d);
3509
3510             *dst++ = pack_1x64_32 (
3511                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3512
3513             w--;
3514         }
3515     }
3516
3517     _mm_empty ();
3518 }
3519
3520 /* ---------------------------------------------------------------------
3521  * composite_over_x888_n_8888
3522  */
3523 static void
3524 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3525                                  pixman_op_t              op,
3526                                  pixman_image_t *         src_image,
3527                                  pixman_image_t *         mask_image,
3528                                  pixman_image_t *         dst_image,
3529                                  int32_t                  src_x,
3530                                  int32_t                  src_y,
3531                                  int32_t                  mask_x,
3532                                  int32_t                  mask_y,
3533                                  int32_t                  dest_x,
3534                                  int32_t                  dest_y,
3535                                  int32_t                  width,
3536                                  int32_t                  height)
3537 {
3538     uint32_t    *dst_line, *dst;
3539     uint32_t    *src_line, *src;
3540     uint32_t mask;
3541     int dst_stride, src_stride;
3542     int32_t w;
3543
3544     __m128i xmm_mask, xmm_alpha;
3545     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3546     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3547
3548     PIXMAN_IMAGE_GET_LINE (
3549         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3550     PIXMAN_IMAGE_GET_LINE (
3551         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3552
3553     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3554
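    /* The x888 source carries no alpha, so each pixel is OR'ed with
     * 0xff000000 below to make it opaque, and mask_00ff serves as the
     * already-expanded unit alpha for in_over.
     */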
3555     xmm_mask = create_mask_16_128 (mask >> 24);
3556     xmm_alpha = mask_00ff;
3557
3558     while (height--)
3559     {
3560         dst = dst_line;
3561         dst_line += dst_stride;
3562         src = src_line;
3563         src_line += src_stride;
3564         w = width;
3565
3566         /* call prefetch hint to optimize cache load*/
3567         cache_prefetch ((__m128i*)dst);
3568         cache_prefetch ((__m128i*)src);
3569
3570         while (w && (unsigned long)dst & 15)
3571         {
3572             uint32_t s = (*src++) | 0xff000000;
3573             uint32_t d = *dst;
3574
3575             __m64 src   = unpack_32_1x64 (s);
3576             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3577             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3578             __m64 dest  = unpack_32_1x64 (d);
3579
3580             *dst++ = pack_1x64_32 (
3581                 in_over_1x64 (&src, &alpha, &mask, &dest));
3582
3583             w--;
3584         }
3585
3586         /* call prefetch hint to optimize cache load*/
3587         cache_prefetch ((__m128i*)dst);
3588         cache_prefetch ((__m128i*)src);
3589
3590         while (w >= 4)
3591         {
3592             /* fill cache line with next memory */
3593             cache_prefetch_next ((__m128i*)dst);
3594             cache_prefetch_next ((__m128i*)src);
3595
3596             xmm_src = _mm_or_si128 (
3597                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3598             xmm_dst = load_128_aligned ((__m128i*)dst);
3599
3600             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3601             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3602
3603             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3604                            &xmm_alpha, &xmm_alpha,
3605                            &xmm_mask, &xmm_mask,
3606                            &xmm_dst_lo, &xmm_dst_hi);
3607
3608             save_128_aligned (
3609                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3610
3611             dst += 4;
3612             src += 4;
3613             w -= 4;
3614
3615         }
3616
3617         while (w)
3618         {
3619             uint32_t s = (*src++) | 0xff000000;
3620             uint32_t d = *dst;
3621
3622             __m64 src  = unpack_32_1x64 (s);
3623             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3624             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3625             __m64 dest  = unpack_32_1x64 (d);
3626
3627             *dst++ = pack_1x64_32 (
3628                 in_over_1x64 (&src, &alpha, &mask, &dest));
3629
3630             w--;
3631         }
3632     }
3633
3634     _mm_empty ();
3635 }
3636
3637 /* --------------------------------------------------------------------
3638  * composite_over_8888_8888
3639  */
3640 static void
3641 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3642                                pixman_op_t              op,
3643                                pixman_image_t *         src_image,
3644                                pixman_image_t *         mask_image,
3645                                pixman_image_t *         dst_image,
3646                                int32_t                  src_x,
3647                                int32_t                  src_y,
3648                                int32_t                  mask_x,
3649                                int32_t                  mask_y,
3650                                int32_t                  dest_x,
3651                                int32_t                  dest_y,
3652                                int32_t                  width,
3653                                int32_t                  height)
3654 {
3655     int dst_stride, src_stride;
3656     uint32_t    *dst_line, *dst;
3657     uint32_t    *src_line, *src;
3658
3659     PIXMAN_IMAGE_GET_LINE (
3660         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3661     PIXMAN_IMAGE_GET_LINE (
3662         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3663
3664     dst = dst_line;
3665     src = src_line;
3666
3667     while (height--)
3668     {
3669         core_combine_over_u_sse2 (dst, src, NULL, width);
3670
3671         dst += dst_stride;
3672         src += src_stride;
3673     }
3674     _mm_empty ();
3675 }
3676
3677 /* ------------------------------------------------------------------
3678  * composite_over_8888_0565
3679  */
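/* Composite a single a8r8g8b8 pixel OVER an r5g6b5 destination pixel:
 * the destination is expanded to 8888, the OVER is done at 8 bits per
 * channel, and the result is packed back down to 565.
 */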
3680 static force_inline uint16_t
3681 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3682 {
3683     __m64 ms;
3684
3685     ms = unpack_32_1x64 (src);
3686     return pack_565_32_16 (
3687         pack_1x64_32 (
3688             over_1x64 (
3689                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3690 }
3691
3692 static void
3693 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3694                                pixman_op_t              op,
3695                                pixman_image_t *         src_image,
3696                                pixman_image_t *         mask_image,
3697                                pixman_image_t *         dst_image,
3698                                int32_t                  src_x,
3699                                int32_t                  src_y,
3700                                int32_t                  mask_x,
3701                                int32_t                  mask_y,
3702                                int32_t                  dest_x,
3703                                int32_t                  dest_y,
3704                                int32_t                  width,
3705                                int32_t                  height)
3706 {
3707     uint16_t    *dst_line, *dst, d;
3708     uint32_t    *src_line, *src, s;
3709     int dst_stride, src_stride;
3710     int32_t w;
3711
3712     __m128i xmm_alpha_lo, xmm_alpha_hi;
3713     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3714     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3715
3716     PIXMAN_IMAGE_GET_LINE (
3717         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3718     PIXMAN_IMAGE_GET_LINE (
3719         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3720
3721 #if 0
3722     /* FIXME
3723      *
3724      * This code was copied from the MMX version, FIXME included.
3725      * If it's a problem there, it's probably a problem here too.
3726      */
3727     assert (src_image->drawable == mask_image->drawable);
3728 #endif
3729
3730     while (height--)
3731     {
3732         dst = dst_line;
3733         src = src_line;
3734
3735         /* call prefetch hint to optimize cache load*/
3736         cache_prefetch ((__m128i*)src);
3737         cache_prefetch ((__m128i*)dst);
3738
3739         dst_line += dst_stride;
3740         src_line += src_stride;
3741         w = width;
3742
3743         /* Align dst on a 16-byte boundary */
3744         while (w &&
3745                ((unsigned long)dst & 15))
3746         {
3747             s = *src++;
3748             d = *dst;
3749
3750             *dst++ = composite_over_8888_0565pixel (s, d);
3751             w--;
3752         }
3753
3754         /* call prefetch hint to optimize cache load*/
3755         cache_prefetch ((__m128i*)src);
3756         cache_prefetch ((__m128i*)dst);
3757
3758         /* This is an 8-pixel loop */
3759         while (w >= 8)
3760         {
3761             /* fill cache line with next memory */
3762             cache_prefetch_next ((__m128i*)src);
3763             cache_prefetch_next ((__m128i*)dst);
3764
3765             /* Load the source unaligned because its address
3766              * may not be 16-byte aligned.
3767              */
3768             xmm_src = load_128_unaligned ((__m128i*) src);
3769             xmm_dst = load_128_aligned ((__m128i*) dst);
3770
3771             /* Unpacking */
3772             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3773             unpack_565_128_4x128 (xmm_dst,
3774                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3775             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3776                                 &xmm_alpha_lo, &xmm_alpha_hi);
3777
3778             /* Load the next 4 pixels from memory early
3779              * to overlap the read with the work below.
3780              */
3781             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3782
3783             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3784                         &xmm_alpha_lo, &xmm_alpha_hi,
3785                         &xmm_dst0, &xmm_dst1);
3786
3787             /* Unpacking */
3788             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3789             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3790                                 &xmm_alpha_lo, &xmm_alpha_hi);
3791
3792             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3793                         &xmm_alpha_lo, &xmm_alpha_hi,
3794                         &xmm_dst2, &xmm_dst3);
3795
3796             save_128_aligned (
3797                 (__m128i*)dst, pack_565_4x128_128 (
3798                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3799
3800             w -= 8;
3801             dst += 8;
3802             src += 8;
3803         }
3804
3805         while (w--)
3806         {
3807             s = *src++;
3808             d = *dst;
3809
3810             *dst++ = composite_over_8888_0565pixel (s, d);
3811         }
3812     }
3813
3814     _mm_empty ();
3815 }
3816
3817 /* -----------------------------------------------------------------
3818  * composite_over_n_8_8888
3819  */
3820
3821 static void
3822 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3823                               pixman_op_t              op,
3824                               pixman_image_t *         src_image,
3825                               pixman_image_t *         mask_image,
3826                               pixman_image_t *         dst_image,
3827                               int32_t                  src_x,
3828                               int32_t                  src_y,
3829                               int32_t                  mask_x,
3830                               int32_t                  mask_y,
3831                               int32_t                  dest_x,
3832                               int32_t                  dest_y,
3833                               int32_t                  width,
3834                               int32_t                  height)
3835 {
3836     uint32_t src, srca;
3837     uint32_t *dst_line, *dst;
3838     uint8_t *mask_line, *mask;
3839     int dst_stride, mask_stride;
3840     int32_t w;
3841     uint32_t m, d;
3842
3843     __m128i xmm_src, xmm_alpha, xmm_def;
3844     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3845     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3846
3847     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3848
3849     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3850
3851     srca = src >> 24;
3852     if (src == 0)
3853         return;
3854
3855     PIXMAN_IMAGE_GET_LINE (
3856         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3857     PIXMAN_IMAGE_GET_LINE (
3858         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3859
3860     xmm_def = create_mask_2x32_128 (src, src);
3861     xmm_src = expand_pixel_32_1x128 (src);
3862     xmm_alpha = expand_alpha_1x128 (xmm_src);
3863     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3864     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3865
3866     while (height--)
3867     {
3868         dst = dst_line;
3869         dst_line += dst_stride;
3870         mask = mask_line;
3871         mask_line += mask_stride;
3872         w = width;
3873
3874         /* call prefetch hint to optimize cache load*/
3875         cache_prefetch ((__m128i*)mask);
3876         cache_prefetch ((__m128i*)dst);
3877
3878         while (w && (unsigned long)dst & 15)
3879         {
3880             uint8_t m = *mask++;
3881
3882             if (m)
3883             {
3884                 d = *dst;
3885                 mmx_mask = expand_pixel_8_1x64 (m);
3886                 mmx_dest = unpack_32_1x64 (d);
3887
3888                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3889                                                    &mmx_alpha,
3890                                                    &mmx_mask,
3891                                                    &mmx_dest));
3892             }
3893
3894             w--;
3895             dst++;
3896         }
3897
3898         /* call prefetch hint to optimize cache load*/
3899         cache_prefetch ((__m128i*)mask);
3900         cache_prefetch ((__m128i*)dst);
3901
3902         while (w >= 4)
3903         {
3904             /* fill cache line with next memory */
3905             cache_prefetch_next ((__m128i*)mask);
3906             cache_prefetch_next ((__m128i*)dst);
3907
3908             m = *((uint32_t*)mask);
3909
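            /* With a fully opaque source and a fully set mask the OVER
             * result is just the solid source, so store it directly.
             */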
3910             if (srca == 0xff && m == 0xffffffff)
3911             {
3912                 save_128_aligned ((__m128i*)dst, xmm_def);
3913             }
3914             else if (m)
3915             {
3916                 xmm_dst = load_128_aligned ((__m128i*) dst);
3917                 xmm_mask = unpack_32_1x128 (m);
3918                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3919
3920                 /* Unpacking */
3921                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3922                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3923
3924                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3925                                         &xmm_mask_lo, &xmm_mask_hi);
3926
3927                 in_over_2x128 (&xmm_src, &xmm_src,
3928                                &xmm_alpha, &xmm_alpha,
3929                                &xmm_mask_lo, &xmm_mask_hi,
3930                                &xmm_dst_lo, &xmm_dst_hi);
3931
3932                 save_128_aligned (
3933                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3934             }
3935
3936             w -= 4;
3937             dst += 4;
3938             mask += 4;
3939         }
3940
3941         while (w)
3942         {
3943             uint8_t m = *mask++;
3944
3945             if (m)
3946             {
3947                 d = *dst;
3948                 mmx_mask = expand_pixel_8_1x64 (m);
3949                 mmx_dest = unpack_32_1x64 (d);
3950
3951                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3952                                                    &mmx_alpha,
3953                                                    &mmx_mask,
3954                                                    &mmx_dest));
3955             }
3956
3957             w--;
3958             dst++;
3959         }
3960     }
3961
3962     _mm_empty ();
3963 }
3964
3965 /* ----------------------------------------------------------------
3966  * pixman_fill_sse2
3967  */
3968
3969 pixman_bool_t
3970 pixman_fill_sse2 (uint32_t *bits,
3971                   int       stride,
3972                   int       bpp,
3973                   int       x,
3974                   int       y,
3975                   int       width,
3976                   int       height,
3977                   uint32_t  data)
3978 {
3979     uint32_t byte_width;
3980     uint8_t         *byte_line;
3981
3982     __m128i xmm_def;
3983
3984     if (bpp == 8)
3985     {
3986         uint8_t b;
3987         uint16_t w;
3988
3989         stride = stride * (int) sizeof (uint32_t) / 1;
3990         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3991         byte_width = width;
3992         stride *= 1;
3993
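        /* Replicate the 8-bit fill value into all four bytes of 'data'
         * so the 16-bit, 32-bit and 128-bit stores below all write the
         * same byte pattern.
         */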
3994         b = data & 0xff;
3995         w = (b << 8) | b;
3996         data = (w << 16) | w;
3997     }
3998     else if (bpp == 16)
3999     {
4000         stride = stride * (int) sizeof (uint32_t) / 2;
4001         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
4002         byte_width = 2 * width;
4003         stride *= 2;
4004
4005         data = (data & 0xffff) * 0x00010001;
4006     }
4007     else if (bpp == 32)
4008     {
4009         stride = stride * (int) sizeof (uint32_t) / 4;
4010         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
4011         byte_width = 4 * width;
4012         stride *= 4;
4013     }
4014     else
4015     {
4016         return FALSE;
4017     }
4018
4019     cache_prefetch ((__m128i*)byte_line);
4020     xmm_def = create_mask_2x32_128 (data, data);
4021
4022     while (height--)
4023     {
4024         int w;
4025         uint8_t *d = byte_line;
4026         byte_line += stride;
4027         w = byte_width;
4028
4029         cache_prefetch_next ((__m128i*)d);
4030
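        /* Store single bytes, then 16-bit and 32-bit words, until 'd'
         * reaches 16-byte alignment (or 'w' runs out), so the SSE2 loop
         * below can use aligned 128-bit stores.
         */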
4031         while (w >= 1 && ((unsigned long)d & 1))
4032         {
4033             *(uint8_t *)d = data;
4034             w -= 1;
4035             d += 1;
4036         }
4037
4038         while (w >= 2 && ((unsigned long)d & 3))
4039         {
4040             *(uint16_t *)d = data;
4041             w -= 2;
4042             d += 2;
4043         }
4044
4045         while (w >= 4 && ((unsigned long)d & 15))
4046         {
4047             *(uint32_t *)d = data;
4048
4049             w -= 4;
4050             d += 4;
4051         }
4052
4053         cache_prefetch_next ((__m128i*)d);
4054
4055         while (w >= 128)
4056         {
4057             cache_prefetch (((__m128i*)d) + 12);
4058
4059             save_128_aligned ((__m128i*)(d),     xmm_def);
4060             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4061             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4062             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4063             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
4064             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
4065             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
4066             save_128_aligned ((__m128i*)(d + 112), xmm_def);
4067
4068             d += 128;
4069             w -= 128;
4070         }
4071
4072         if (w >= 64)
4073         {
4074             cache_prefetch (((__m128i*)d) + 8);
4075
4076             save_128_aligned ((__m128i*)(d),     xmm_def);
4077             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4078             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4079             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4080
4081             d += 64;
4082             w -= 64;
4083         }
4084
4085         cache_prefetch_next ((__m128i*)d);
4086
4087         if (w >= 32)
4088         {
4089             save_128_aligned ((__m128i*)(d),     xmm_def);
4090             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4091
4092             d += 32;
4093             w -= 32;
4094         }
4095
4096         if (w >= 16)
4097         {
4098             save_128_aligned ((__m128i*)(d),     xmm_def);
4099
4100             d += 16;
4101             w -= 16;
4102         }
4103
4104         cache_prefetch_next ((__m128i*)d);
4105
4106         while (w >= 4)
4107         {
4108             *(uint32_t *)d = data;
4109
4110             w -= 4;
4111             d += 4;
4112         }
4113
4114         if (w >= 2)
4115         {
4116             *(uint16_t *)d = data;
4117             w -= 2;
4118             d += 2;
4119         }
4120
4121         if (w >= 1)
4122         {
4123             *(uint8_t *)d = data;
4124             w -= 1;
4125             d += 1;
4126         }
4127     }
4128
4129     _mm_empty ();
4130     return TRUE;
4131 }
4132
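/* A minimal usage sketch for the new 8bpp path (the coordinates and fill
 * value below are arbitrary, not taken from any caller in this file):
 *
 *     uint32_t *bits;    -- pixel data of an 8bpp (e.g. a8) image
 *     int stride;        -- row stride, in uint32_t units as elsewhere
 *
 *     pixman_fill_sse2 (bits, stride, 8, 10, 20, 100, 50, 0x7f);
 *
 * Only the low byte of 'data' is used for 8bpp fills; it is replicated
 * across the whole 32-bit value before the wide stores above.
 */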
4133 static void
4134 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4135                              pixman_op_t              op,
4136                              pixman_image_t *         src_image,
4137                              pixman_image_t *         mask_image,
4138                              pixman_image_t *         dst_image,
4139                              int32_t                  src_x,
4140                              int32_t                  src_y,
4141                              int32_t                  mask_x,
4142                              int32_t                  mask_y,
4143                              int32_t                  dest_x,
4144                              int32_t                  dest_y,
4145                              int32_t                  width,
4146                              int32_t                  height)
4147 {
4148     uint32_t src, srca;
4149     uint32_t    *dst_line, *dst;
4150     uint8_t     *mask_line, *mask;
4151     int dst_stride, mask_stride;
4152     int32_t w;
4153     uint32_t m;
4154
4155     __m128i xmm_src, xmm_def;
4156     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4157
4158     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4159
4160     srca = src >> 24;
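    /* With the SRC operator the result is src IN mask; a fully transparent
     * solid source therefore clears the destination regardless of the mask,
     * which pixman_fill_sse2 can do directly.
     */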
4161     if (src == 0)
4162     {
4163         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4164                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
4165                           dest_x, dest_y, width, height, 0);
4166         return;
4167     }
4168
4169     PIXMAN_IMAGE_GET_LINE (
4170         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4171     PIXMAN_IMAGE_GET_LINE (
4172         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4173
4174     xmm_def = create_mask_2x32_128 (src, src);
4175     xmm_src = expand_pixel_32_1x128 (src);
4176
4177     while (height--)
4178     {
4179         dst = dst_line;
4180         dst_line += dst_stride;
4181         mask = mask_line;
4182         mask_line += mask_stride;
4183         w = width;
4184
4185         /* call prefetch hint to optimize cache load*/
4186         cache_prefetch ((__m128i*)mask);
4187         cache_prefetch ((__m128i*)dst);
4188
4189         while (w && (unsigned long)dst & 15)
4190         {
4191             uint8_t m = *mask++;
4192
4193             if (m)
4194             {
4195                 *dst = pack_1x64_32 (
4196                     pix_multiply_1x64 (
4197                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4198             }
4199             else
4200             {
4201                 *dst = 0;
4202             }
4203
4204             w--;
4205             dst++;
4206         }
4207
4208         /* call prefetch hint to optimize cache load*/
4209         cache_prefetch ((__m128i*)mask);
4210         cache_prefetch ((__m128i*)dst);
4211
4212         while (w >= 4)
4213         {
4214             /* fill cache line with next memory */
4215             cache_prefetch_next ((__m128i*)mask);
4216             cache_prefetch_next ((__m128i*)dst);
4217
4218             m = *((uint32_t*)mask);
4219
4220             if (srca == 0xff && m == 0xffffffff)
4221             {
4222                 save_128_aligned ((__m128i*)dst, xmm_def);
4223             }
4224             else if (m)
4225             {
4226                 xmm_mask = unpack_32_1x128 (m);
4227                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4228
4229                 /* Unpacking */
4230                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4231
4232                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4233                                         &xmm_mask_lo, &xmm_mask_hi);
4234
4235                 pix_multiply_2x128 (&xmm_src, &xmm_src,
4236                                     &xmm_mask_lo, &xmm_mask_hi,
4237                                     &xmm_mask_lo, &xmm_mask_hi);
4238
4239                 save_128_aligned (
4240                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4241             }
4242             else
4243             {
4244                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4245             }
4246
4247             w -= 4;
4248             dst += 4;
4249             mask += 4;
4250         }
4251
4252         while (w)
4253         {
4254             uint8_t m = *mask++;
4255
4256             if (m)
4257             {
4258                 *dst = pack_1x64_32 (
4259                     pix_multiply_1x64 (
4260                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4261             }
4262             else
4263             {
4264                 *dst = 0;
4265             }
4266
4267             w--;
4268             dst++;
4269         }
4270     }
4271
4272     _mm_empty ();
4273 }
4274
4275 /*-----------------------------------------------------------------------
4276  * composite_over_n_8_0565
4277  */
4278
4279 static void
4280 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4281                               pixman_op_t              op,
4282                               pixman_image_t *         src_image,
4283                               pixman_image_t *         mask_image,
4284                               pixman_image_t *         dst_image,
4285                               int32_t                  src_x,
4286                               int32_t                  src_y,
4287                               int32_t                  mask_x,
4288                               int32_t                  mask_y,
4289                               int32_t                  dest_x,
4290                               int32_t                  dest_y,
4291                               int32_t                  width,
4292                               int32_t                  height)
4293 {
4294     uint32_t src, srca;
4295     uint16_t    *dst_line, *dst, d;
4296     uint8_t     *mask_line, *mask;
4297     int dst_stride, mask_stride;
4298     int32_t w;
4299     uint32_t m;
4300     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4301
4302     __m128i xmm_src, xmm_alpha;
4303     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4304     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4305
4306     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4307
4308     srca = src >> 24;
4309     if (src == 0)
4310         return;
4311
4312     PIXMAN_IMAGE_GET_LINE (
4313         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4314     PIXMAN_IMAGE_GET_LINE (
4315         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4316
4317     xmm_src = expand_pixel_32_1x128 (src);
4318     xmm_alpha = expand_alpha_1x128 (xmm_src);
4319     mmx_src = _mm_movepi64_pi64 (xmm_src);
4320     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4321
4322     while (height--)
4323     {
4324         dst = dst_line;
4325         dst_line += dst_stride;
4326         mask = mask_line;
4327         mask_line += mask_stride;
4328         w = width;
4329
4330         /* call prefetch hint to optimize cache load*/
4331         cache_prefetch ((__m128i*)mask);
4332         cache_prefetch ((__m128i*)dst);
4333
4334         while (w && (unsigned long)dst & 15)
4335         {
4336             m = *mask++;
4337
4338             if (m)
4339             {
4340                 d = *dst;
4341                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4342                 mmx_dest = expand565_16_1x64 (d);
4343
4344                 *dst = pack_565_32_16 (
4345                     pack_1x64_32 (
4346                         in_over_1x64 (
4347                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4348             }
4349
4350             w--;
4351             dst++;
4352         }
4353
4354         /* call prefetch hint to optimize cache load*/
4355         cache_prefetch ((__m128i*)mask);
4356         cache_prefetch ((__m128i*)dst);
4357
4358         while (w >= 8)
4359         {
4360             /* fill cache line with next memory */
4361             cache_prefetch_next ((__m128i*)mask);
4362             cache_prefetch_next ((__m128i*)dst);
4363
4364             xmm_dst = load_128_aligned ((__m128i*) dst);
4365             unpack_565_128_4x128 (xmm_dst,
4366                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4367
4368             m = *((uint32_t*)mask);
4369             mask += 4;
4370
4371             if (m)
4372             {
4373                 xmm_mask = unpack_32_1x128 (m);
4374                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4375
4376                 /* Unpacking */
4377                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4378
4379                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4380                                         &xmm_mask_lo, &xmm_mask_hi);
4381
4382                 in_over_2x128 (&xmm_src, &xmm_src,
4383                                &xmm_alpha, &xmm_alpha,
4384                                &xmm_mask_lo, &xmm_mask_hi,
4385                                &xmm_dst0, &xmm_dst1);
4386             }
4387
4388             m = *((uint32_t*)mask);
4389             mask += 4;
4390
4391             if (m)
4392             {
4393                 xmm_mask = unpack_32_1x128 (m);
4394                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4395
4396                 /* Unpacking */
4397                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4398
4399                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4400                                         &xmm_mask_lo, &xmm_mask_hi);
4401                 in_over_2x128 (&xmm_src, &xmm_src,
4402                                &xmm_alpha, &xmm_alpha,
4403                                &xmm_mask_lo, &xmm_mask_hi,
4404                                &xmm_dst2, &xmm_dst3);
4405             }
4406
4407             save_128_aligned (
4408                 (__m128i*)dst, pack_565_4x128_128 (
4409                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4410
4411             w -= 8;
4412             dst += 8;
4413         }
4414
4415         while (w)
4416         {
4417             m = *mask++;
4418
4419             if (m)
4420             {
4421                 d = *dst;
4422                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4423                 mmx_dest = expand565_16_1x64 (d);
4424
4425                 *dst = pack_565_32_16 (
4426                     pack_1x64_32 (
4427                         in_over_1x64 (
4428                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4429             }
4430
4431             w--;
4432             dst++;
4433         }
4434     }
4435
4436     _mm_empty ();
4437 }
4438
4439 /* -----------------------------------------------------------------------
4440  * composite_over_pixbuf_0565
4441  */
4442
4443 static void
4444 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4445                                  pixman_op_t              op,
4446                                  pixman_image_t *         src_image,
4447                                  pixman_image_t *         mask_image,
4448                                  pixman_image_t *         dst_image,
4449                                  int32_t                  src_x,
4450                                  int32_t                  src_y,
4451                                  int32_t                  mask_x,
4452                                  int32_t                  mask_y,
4453                                  int32_t                  dest_x,
4454                                  int32_t                  dest_y,
4455                                  int32_t                  width,
4456                                  int32_t                  height)
4457 {
4458     uint16_t    *dst_line, *dst, d;
4459     uint32_t    *src_line, *src, s;
4460     int dst_stride, src_stride;
4461     int32_t w;
4462     uint32_t opaque, zero;
4463
4464     __m64 ms;
4465     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4466     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4467
4468     PIXMAN_IMAGE_GET_LINE (
4469         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4470     PIXMAN_IMAGE_GET_LINE (
4471         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4472
4473 #if 0
4474     /* FIXME
4475      *
4476      * This code was copied from the MMX version, FIXME included.
4477      * If it's a problem there, it's probably a problem here too.
4478      */
4479     assert (src_image->drawable == mask_image->drawable);
4480 #endif
4481
4482     while (height--)
4483     {
4484         dst = dst_line;
4485         dst_line += dst_stride;
4486         src = src_line;
4487         src_line += src_stride;
4488         w = width;
4489
4490         /* call prefetch hint to optimize cache load*/
4491         cache_prefetch ((__m128i*)src);
4492         cache_prefetch ((__m128i*)dst);
4493
4494         while (w && (unsigned long)dst & 15)
4495         {
4496             s = *src++;
4497             d = *dst;
4498
4499             ms = unpack_32_1x64 (s);
4500
4501             *dst++ = pack_565_32_16 (
4502                 pack_1x64_32 (
4503                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4504             w--;
4505         }
4506
4507         /* call prefetch hint to optimize cache load*/
4508         cache_prefetch ((__m128i*)src);
4509         cache_prefetch ((__m128i*)dst);
4510
4511         while (w >= 8)
4512         {
4513             /* fill cache line with next memory */
4514             cache_prefetch_next ((__m128i*)src);
4515             cache_prefetch_next ((__m128i*)dst);
4516
4517             /* First round */
4518             xmm_src = load_128_unaligned ((__m128i*)src);
4519             xmm_dst = load_128_aligned  ((__m128i*)dst);
4520
4521             opaque = is_opaque (xmm_src);
4522             zero = is_zero (xmm_src);
4523
4524             unpack_565_128_4x128 (xmm_dst,
4525                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4526             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4527
4528             /* preload next round*/
4529             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4530
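            /* If all four source pixels are opaque, OVER reduces to a
             * plain channel-swapped copy of the (non-premultiplied) source;
             * if they are all zero, the destination is left untouched.
             */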
4531             if (opaque)
4532             {
4533                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4534                                      &xmm_dst0, &xmm_dst1);
4535             }
4536             else if (!zero)
4537             {
4538                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4539                                         &xmm_dst0, &xmm_dst1);
4540             }
4541
4542             /* Second round */
4543             opaque = is_opaque (xmm_src);
4544             zero = is_zero (xmm_src);
4545
4546             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4547
4548             if (opaque)
4549             {
4550                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4551                                      &xmm_dst2, &xmm_dst3);
4552             }
4553             else if (!zero)
4554             {
4555                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4556                                         &xmm_dst2, &xmm_dst3);
4557             }
4558
4559             save_128_aligned (
4560                 (__m128i*)dst, pack_565_4x128_128 (
4561                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4562
4563             w -= 8;
4564             src += 8;
4565             dst += 8;
4566         }
4567
4568         while (w)
4569         {
4570             s = *src++;
4571             d = *dst;
4572
4573             ms = unpack_32_1x64 (s);
4574
4575             *dst++ = pack_565_32_16 (
4576                 pack_1x64_32 (
4577                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4578             w--;
4579         }
4580     }
4581
4582     _mm_empty ();
4583 }
4584
4585 /* -------------------------------------------------------------------------
4586  * composite_over_pixbuf_8888
4587  */
4588
4589 static void
4590 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4591                                  pixman_op_t              op,
4592                                  pixman_image_t *         src_image,
4593                                  pixman_image_t *         mask_image,
4594                                  pixman_image_t *         dst_image,
4595                                  int32_t                  src_x,
4596                                  int32_t                  src_y,
4597                                  int32_t                  mask_x,
4598                                  int32_t                  mask_y,
4599                                  int32_t                  dest_x,
4600                                  int32_t                  dest_y,
4601                                  int32_t                  width,
4602                                  int32_t                  height)
4603 {
4604     uint32_t    *dst_line, *dst, d;
4605     uint32_t    *src_line, *src, s;
4606     int dst_stride, src_stride;
4607     int32_t w;
4608     uint32_t opaque, zero;
4609
4610     __m128i xmm_src_lo, xmm_src_hi;
4611     __m128i xmm_dst_lo, xmm_dst_hi;
4612
4613     PIXMAN_IMAGE_GET_LINE (
4614         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4615     PIXMAN_IMAGE_GET_LINE (
4616         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4617
4618 #if 0
4619     /* FIXME
4620      *
4621      * This code was copied from the MMX version, FIXME included.
4622      * If it's a problem there, it's probably a problem here too.
4623      */
4624     assert (src_image->drawable == mask_image->drawable);
4625 #endif
4626
4627     while (height--)
4628     {
4629         dst = dst_line;
4630         dst_line += dst_stride;
4631         src = src_line;
4632         src_line += src_stride;
4633         w = width;
4634
4635         /* call prefetch hint to optimize cache load*/
4636         cache_prefetch ((__m128i*)src);
4637         cache_prefetch ((__m128i*)dst);
4638
4639         while (w && (unsigned long)dst & 15)
4640         {
4641             s = *src++;
4642             d = *dst;
4643
4644             *dst++ = pack_1x64_32 (
4645                 over_rev_non_pre_1x64 (
4646                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4647
4648             w--;
4649         }
4650
4651         /* call prefetch hint to optimize cache load*/
4652         cache_prefetch ((__m128i*)src);
4653         cache_prefetch ((__m128i*)dst);
4654
4655         while (w >= 4)
4656         {
4657             /* fill cache line with next memory */
4658             cache_prefetch_next ((__m128i*)src);
4659             cache_prefetch_next ((__m128i*)dst);
4660
4661             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4662
4663             opaque = is_opaque (xmm_src_hi);
4664             zero = is_zero (xmm_src_hi);
4665
4666             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4667
4668             if (opaque)
4669             {
4670                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4671                                      &xmm_dst_lo, &xmm_dst_hi);
4672
4673                 save_128_aligned (
4674                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4675             }
4676             else if (!zero)
4677             {
4678                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4679
4680                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4681
4682                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4683                                         &xmm_dst_lo, &xmm_dst_hi);
4684
4685                 save_128_aligned (
4686                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4687             }
4688
4689             w -= 4;
4690             dst += 4;
4691             src += 4;
4692         }
4693
4694         while (w)
4695         {
4696             s = *src++;
4697             d = *dst;
4698
4699             *dst++ = pack_1x64_32 (
4700                 over_rev_non_pre_1x64 (
4701                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4702
4703             w--;
4704         }
4705     }
4706
4707     _mm_empty ();
4708 }
4709
4710 /* -------------------------------------------------------------------------------------------------
4711  * composite_over_n_8888_0565_ca
4712  */
4713
4714 static void
4715 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4716                                     pixman_op_t              op,
4717                                     pixman_image_t *         src_image,
4718                                     pixman_image_t *         mask_image,
4719                                     pixman_image_t *         dst_image,
4720                                     int32_t                  src_x,
4721                                     int32_t                  src_y,
4722                                     int32_t                  mask_x,
4723                                     int32_t                  mask_y,
4724                                     int32_t                  dest_x,
4725                                     int32_t                  dest_y,
4726                                     int32_t                  width,
4727                                     int32_t                  height)
4728 {
4729     uint32_t src;
4730     uint16_t    *dst_line, *dst, d;
4731     uint32_t    *mask_line, *mask, m;
4732     int dst_stride, mask_stride;
4733     int w;
4734     uint32_t pack_cmp;
4735
4736     __m128i xmm_src, xmm_alpha;
4737     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4738     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4739
4740     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4741
4742     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4743
4744     if (src == 0)
4745         return;
4746
4747     PIXMAN_IMAGE_GET_LINE (
4748         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4749     PIXMAN_IMAGE_GET_LINE (
4750         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4751
4752     xmm_src = expand_pixel_32_1x128 (src);
4753     xmm_alpha = expand_alpha_1x128 (xmm_src);
4754     mmx_src = _mm_movepi64_pi64 (xmm_src);
4755     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4756
4757     while (height--)
4758     {
4759         w = width;
4760         mask = mask_line;
4761         dst = dst_line;
4762         mask_line += mask_stride;
4763         dst_line += dst_stride;
4764
4765         /* call prefetch hint to optimize cache load*/
4766         cache_prefetch ((__m128i*)mask);
4767         cache_prefetch ((__m128i*)dst);
4768
4769         while (w && ((unsigned long)dst & 15))
4770         {
4771             m = *(uint32_t *) mask;
4772
4773             if (m)
4774             {
4775                 d = *dst;
4776                 mmx_mask = unpack_32_1x64 (m);
4777                 mmx_dest = expand565_16_1x64 (d);
4778
4779                 *dst = pack_565_32_16 (
4780                     pack_1x64_32 (
4781                         in_over_1x64 (
4782                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4783             }
4784
4785             w--;
4786             dst++;
4787             mask++;
4788         }
4789
4790         /* call prefetch hint to optimize cache load*/
4791         cache_prefetch ((__m128i*)mask);
4792         cache_prefetch ((__m128i*)dst);
4793
4794         while (w >= 8)
4795         {
4796             /* fill cache line with next memory */
4797             cache_prefetch_next ((__m128i*)mask);
4798             cache_prefetch_next ((__m128i*)dst);
4799
4800             /* First round */
4801             xmm_mask = load_128_unaligned ((__m128i*)mask);
4802             xmm_dst = load_128_aligned ((__m128i*)dst);
4803
4804             pack_cmp = _mm_movemask_epi8 (
4805                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4806
4807             unpack_565_128_4x128 (xmm_dst,
4808                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4809             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4810
4811             /* preload next round */
4812             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4813
4814
4815             if (pack_cmp != 0xffff)
4816             {
4817                 in_over_2x128 (&xmm_src, &xmm_src,
4818                                &xmm_alpha, &xmm_alpha,
4819                                &xmm_mask_lo, &xmm_mask_hi,
4820                                &xmm_dst0, &xmm_dst1);
4821             }
4822
4823             /* Second round */
4824             pack_cmp = _mm_movemask_epi8 (
4825                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4826
4827             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4828
4829             if (pack_cmp != 0xffff)
4830             {
4831                 in_over_2x128 (&xmm_src, &xmm_src,
4832                                &xmm_alpha, &xmm_alpha,
4833                                &xmm_mask_lo, &xmm_mask_hi,
4834                                &xmm_dst2, &xmm_dst3);
4835             }
4836
4837             save_128_aligned (
4838                 (__m128i*)dst, pack_565_4x128_128 (
4839                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4840
4841             w -= 8;
4842             dst += 8;
4843             mask += 8;
4844         }
4845
4846         while (w)
4847         {
4848             m = *(uint32_t *) mask;
4849
4850             if (m)
4851             {
4852                 d = *dst;
4853                 mmx_mask = unpack_32_1x64 (m);
4854                 mmx_dest = expand565_16_1x64 (d);
4855
4856                 *dst = pack_565_32_16 (
4857                     pack_1x64_32 (
4858                         in_over_1x64 (
4859                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4860             }
4861
4862             w--;
4863             dst++;
4864             mask++;
4865         }
4866     }
4867
4868     _mm_empty ();
4869 }
4870
4871 /* -----------------------------------------------------------------------
4872  * composite_in_n_8_8
4873  */
4874
4875 static void
4876 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4877                          pixman_op_t              op,
4878                          pixman_image_t *         src_image,
4879                          pixman_image_t *         mask_image,
4880                          pixman_image_t *         dst_image,
4881                          int32_t                  src_x,
4882                          int32_t                  src_y,
4883                          int32_t                  mask_x,
4884                          int32_t                  mask_y,
4885                          int32_t                  dest_x,
4886                          int32_t                  dest_y,
4887                          int32_t                  width,
4888                          int32_t                  height)
4889 {
4890     uint8_t     *dst_line, *dst;
4891     uint8_t     *mask_line, *mask;
4892     int dst_stride, mask_stride;
4893     uint32_t d, m;
4894     uint32_t src;
4895     uint8_t sa;
4896     int32_t w;
4897
4898     __m128i xmm_alpha;
4899     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4900     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4901
4902     PIXMAN_IMAGE_GET_LINE (
4903         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4904     PIXMAN_IMAGE_GET_LINE (
4905         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4906
4907     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4908
4909     sa = src >> 24;
4910
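    /* For an a8 destination, IN with a solid source and an a8 mask reduces
     * to dest = srca * m * dest, computed with two rounds of pix_multiply
     * (each of which divides by 255).
     */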
4911     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4912
4913     while (height--)
4914     {
4915         dst = dst_line;
4916         dst_line += dst_stride;
4917         mask = mask_line;
4918         mask_line += mask_stride;
4919         w = width;
4920
4921         /* call prefetch hint to optimize cache load*/
4922         cache_prefetch ((__m128i*)mask);
4923         cache_prefetch ((__m128i*)dst);
4924
4925         while (w && ((unsigned long)dst & 15))
4926         {
4927             m = (uint32_t) *mask++;
4928             d = (uint32_t) *dst;
4929
4930             *dst++ = (uint8_t) pack_1x64_32 (
4931                 pix_multiply_1x64 (
4932                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4933                                        unpack_32_1x64 (m)),
4934                     unpack_32_1x64 (d)));
4935             w--;
4936         }
4937
4938         /* call prefetch hint to optimize cache load*/
4939         cache_prefetch ((__m128i*)mask);
4940         cache_prefetch ((__m128i*)dst);
4941
4942         while (w >= 16)
4943         {
4944             /* fill cache line with next memory */
4945             cache_prefetch_next ((__m128i*)mask);
4946             cache_prefetch_next ((__m128i*)dst);
4947
4948             xmm_mask = load_128_unaligned ((__m128i*)mask);
4949             xmm_dst = load_128_aligned ((__m128i*)dst);
4950
4951             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4952             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4953
4954             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4955                                 &xmm_mask_lo, &xmm_mask_hi,
4956                                 &xmm_mask_lo, &xmm_mask_hi);
4957
4958             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4959                                 &xmm_dst_lo, &xmm_dst_hi,
4960                                 &xmm_dst_lo, &xmm_dst_hi);
4961
4962             save_128_aligned (
4963                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4964
4965             mask += 16;
4966             dst += 16;
4967             w -= 16;
4968         }
4969
4970         while (w)
4971         {
4972             m = (uint32_t) *mask++;
4973             d = (uint32_t) *dst;
4974
4975             *dst++ = (uint8_t) pack_1x64_32 (
4976                 pix_multiply_1x64 (
4977                     pix_multiply_1x64 (
4978                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4979                     unpack_32_1x64 (d)));
4980             w--;
4981         }
4982     }
4983
4984     _mm_empty ();
4985 }
4986
4987 /* ---------------------------------------------------------------------------
4988  * composite_in_8_8
4989  */
4990
4991 static void
4992 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4993                        pixman_op_t              op,
4994                        pixman_image_t *         src_image,
4995                        pixman_image_t *         mask_image,
4996                        pixman_image_t *         dst_image,
4997                        int32_t                  src_x,
4998                        int32_t                  src_y,
4999                        int32_t                  mask_x,
5000                        int32_t                  mask_y,
5001                        int32_t                  dest_x,
5002                        int32_t                  dest_y,
5003                        int32_t                  width,
5004                        int32_t                  height)
5005 {
5006     uint8_t     *dst_line, *dst;
5007     uint8_t     *src_line, *src;
5008     int src_stride, dst_stride;
5009     int32_t w;
5010     uint32_t s, d;
5011
5012     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5013     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5014
5015     PIXMAN_IMAGE_GET_LINE (
5016         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5017     PIXMAN_IMAGE_GET_LINE (
5018         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5019
5020     while (height--)
5021     {
5022         dst = dst_line;
5023         dst_line += dst_stride;
5024         src = src_line;
5025         src_line += src_stride;
5026         w = width;
5027
5028         /* call prefetch hint to optimize cache load*/
5029         cache_prefetch ((__m128i*)src);
5030         cache_prefetch ((__m128i*)dst);
5031
5032         while (w && ((unsigned long)dst & 15))
5033         {
5034             s = (uint32_t) *src++;
5035             d = (uint32_t) *dst;
5036
5037             *dst++ = (uint8_t) pack_1x64_32 (
5038                 pix_multiply_1x64 (
5039                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
5040             w--;
5041         }
5042
5043         /* call prefetch hint to optimize cache load*/
5044         cache_prefetch ((__m128i*)src);
5045         cache_prefetch ((__m128i*)dst);
5046
5047         while (w >= 16)
5048         {
5049             /* fill cache line with next memory */
5050             cache_prefetch_next ((__m128i*)src);
5051             cache_prefetch_next ((__m128i*)dst);
5052
5053             xmm_src = load_128_unaligned ((__m128i*)src);
5054             xmm_dst = load_128_aligned ((__m128i*)dst);
5055
5056             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5057             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5058
5059             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5060                                 &xmm_dst_lo, &xmm_dst_hi,
5061                                 &xmm_dst_lo, &xmm_dst_hi);
5062
5063             save_128_aligned (
5064                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5065
5066             src += 16;
5067             dst += 16;
5068             w -= 16;
5069         }
5070
5071         while (w)
5072         {
5073             s = (uint32_t) *src++;
5074             d = (uint32_t) *dst;
5075
5076             *dst++ = (uint8_t) pack_1x64_32 (
5077                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5078             w--;
5079         }
5080     }
5081
5082     _mm_empty ();
5083 }
5084
5085 /* -------------------------------------------------------------------------
5086  * composite_add_n_8_8
5087  */
5088
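/*
 * Implements PIXMAN_OP_ADD for a solid source with an a8 mask onto an a8
 * destination.  Only the alpha of the solid colour matters; a rough
 * scalar sketch of the per-pixel operation is:
 *
 *     dst[i] = min (255, dst[i] + srca * mask[i] / 255);
 *
 * The SSE2 loop multiplies the expanded source alpha by 16 mask bytes at
 * a time and adds the result to the destination with saturating
 * _mm_adds_epu16.
 */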
5089 static void
5090 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5091                           pixman_op_t              op,
5092                           pixman_image_t *         src_image,
5093                           pixman_image_t *         mask_image,
5094                           pixman_image_t *         dst_image,
5095                           int32_t                  src_x,
5096                           int32_t                  src_y,
5097                           int32_t                  mask_x,
5098                           int32_t                  mask_y,
5099                           int32_t                  dest_x,
5100                           int32_t                  dest_y,
5101                           int32_t                  width,
5102                           int32_t                  height)
5103 {
5104     uint8_t     *dst_line, *dst;
5105     uint8_t     *mask_line, *mask;
5106     int dst_stride, mask_stride;
5107     int32_t w;
5108     uint32_t src;
5109     uint8_t sa;
5110     uint32_t m, d;
5111
5112     __m128i xmm_alpha;
5113     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5114     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5115
5116     PIXMAN_IMAGE_GET_LINE (
5117         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5118     PIXMAN_IMAGE_GET_LINE (
5119         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5120
5121     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5122
5123     sa = src >> 24;
5124
5125     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5126
5127     while (height--)
5128     {
5129         dst = dst_line;
5130         dst_line += dst_stride;
5131         mask = mask_line;
5132         mask_line += mask_stride;
5133         w = width;
5134
5135         /* call prefetch hint to optimize cache load*/
5136         cache_prefetch ((__m128i*)mask);
5137         cache_prefetch ((__m128i*)dst);
5138
5139         while (w && ((unsigned long)dst & 15))
5140         {
5141             m = (uint32_t) *mask++;
5142             d = (uint32_t) *dst;
5143
5144             *dst++ = (uint8_t) pack_1x64_32 (
5145                 _mm_adds_pu16 (
5146                     pix_multiply_1x64 (
5147                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5148                     unpack_32_1x64 (d)));
5149             w--;
5150         }
5151
5152         /* call prefetch hint to optimize cache load*/
5153         cache_prefetch ((__m128i*)mask);
5154         cache_prefetch ((__m128i*)dst);
5155
5156         while (w >= 16)
5157         {
5158             /* fill cache line with next memory */
5159             cache_prefetch_next ((__m128i*)mask);
5160             cache_prefetch_next ((__m128i*)dst);
5161
5162             xmm_mask = load_128_unaligned ((__m128i*)mask);
5163             xmm_dst = load_128_aligned ((__m128i*)dst);
5164
5165             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5166             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5167
5168             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5169                                 &xmm_mask_lo, &xmm_mask_hi,
5170                                 &xmm_mask_lo, &xmm_mask_hi);
5171
5172             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5173             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5174
5175             save_128_aligned (
5176                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5177
5178             mask += 16;
5179             dst += 16;
5180             w -= 16;
5181         }
5182
5183         while (w)
5184         {
5185             m = (uint32_t) *mask++;
5186             d = (uint32_t) *dst;
5187
5188             *dst++ = (uint8_t) pack_1x64_32 (
5189                 _mm_adds_pu16 (
5190                     pix_multiply_1x64 (
5191                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5192                     unpack_32_1x64 (d)));
5193
5194             w--;
5195         }
5196     }
5197
5198     _mm_empty ();
5199 }
5200
5201 /* ----------------------------------------------------------------------
5202  * composite_add_8000_8000
5203  */
5204
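/*
 * Implements PIXMAN_OP_ADD for an a8 source and a8 destination: a plain
 * saturating byte-wise add.  The scalar head and tail use a branch-free
 * trick: t holds the 9-bit sum, so when it exceeds 255, (t >> 8) is 1,
 * (0 - (t >> 8)) is all ones, and the OR saturates the stored byte to
 * 0xff.  The aligned middle of each scanline is handed to
 * core_combine_add_u_sse2 (), which treats it as uint32_t quantities
 * (four bytes at a time).
 */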
5205 static void
5206 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5207                               pixman_op_t              op,
5208                               pixman_image_t *         src_image,
5209                               pixman_image_t *         mask_image,
5210                               pixman_image_t *         dst_image,
5211                               int32_t                  src_x,
5212                               int32_t                  src_y,
5213                               int32_t                  mask_x,
5214                               int32_t                  mask_y,
5215                               int32_t                  dest_x,
5216                               int32_t                  dest_y,
5217                               int32_t                  width,
5218                               int32_t                  height)
5219 {
5220     uint8_t     *dst_line, *dst;
5221     uint8_t     *src_line, *src;
5222     int dst_stride, src_stride;
5223     int32_t w;
5224     uint16_t t;
5225
5226     PIXMAN_IMAGE_GET_LINE (
5227         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5228     PIXMAN_IMAGE_GET_LINE (
5229         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5230
5231     while (height--)
5232     {
5233         dst = dst_line;
5234         src = src_line;
5235
5236         /* call prefetch hint to optimize cache load*/
5237         cache_prefetch ((__m128i*)src);
5238         cache_prefetch ((__m128i*)dst);
5239
5240         dst_line += dst_stride;
5241         src_line += src_stride;
5242         w = width;
5243
5244         /* Small head */
5245         while (w && (unsigned long)dst & 3)
5246         {
5247             t = (*dst) + (*src++);
5248             *dst++ = t | (0 - (t >> 8));
5249             w--;
5250         }
5251
5252         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5253
5254         /* Small tail */
5255         dst += w & 0xfffc;
5256         src += w & 0xfffc;
5257
5258         w &= 3;
5259
5260         while (w)
5261         {
5262             t = (*dst) + (*src++);
5263             *dst++ = t | (0 - (t >> 8));
5264             w--;
5265         }
5266     }
5267
5268     _mm_empty ();
5269 }
5270
5271 /* ---------------------------------------------------------------------
5272  * composite_add_8888_8888
5273  */
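/*
 * Implements PIXMAN_OP_ADD for a8r8g8b8 source and destination by
 * delegating each scanline to core_combine_add_u_sse2 (), which performs
 * per-channel saturating adds.
 */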
5274 static void
5275 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5276                               pixman_op_t              op,
5277                               pixman_image_t *         src_image,
5278                               pixman_image_t *         mask_image,
5279                               pixman_image_t *         dst_image,
5280                               int32_t                  src_x,
5281                               int32_t                  src_y,
5282                               int32_t                  mask_x,
5283                               int32_t                  mask_y,
5284                               int32_t                  dest_x,
5285                               int32_t                  dest_y,
5286                               int32_t                  width,
5287                               int32_t                  height)
5288 {
5289     uint32_t    *dst_line, *dst;
5290     uint32_t    *src_line, *src;
5291     int dst_stride, src_stride;
5292
5293     PIXMAN_IMAGE_GET_LINE (
5294         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5295     PIXMAN_IMAGE_GET_LINE (
5296         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5297
5298     while (height--)
5299     {
5300         dst = dst_line;
5301         dst_line += dst_stride;
5302         src = src_line;
5303         src_line += src_stride;
5304
5305         core_combine_add_u_sse2 (dst, src, NULL, width);
5306     }
5307
5308     _mm_empty ();
5309 }
5310
5311 /* -------------------------------------------------------------------------
5312  * pixman_blt_sse2 / sse2_composite_copy_area
5313  */
5314
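/*
 * Raw rectangle copy used by the copy-area and blt entry points.  It only
 * handles the case where source and destination have the same depth and
 * that depth is 16 or 32 bpp; anything else returns FALSE so the caller
 * can fall back.  Strides arrive in uint32_t units and are converted to
 * bytes, the destination is aligned to 16 bytes with small scalar copies,
 * and the bulk of each scanline is moved 64 bytes per iteration using
 * unaligned loads and aligned stores.
 */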
5315 static pixman_bool_t
5316 pixman_blt_sse2 (uint32_t *src_bits,
5317                  uint32_t *dst_bits,
5318                  int       src_stride,
5319                  int       dst_stride,
5320                  int       src_bpp,
5321                  int       dst_bpp,
5322                  int       src_x,
5323                  int       src_y,
5324                  int       dst_x,
5325                  int       dst_y,
5326                  int       width,
5327                  int       height)
5328 {
5329     uint8_t *   src_bytes;
5330     uint8_t *   dst_bytes;
5331     int byte_width;
5332
5333     if (src_bpp != dst_bpp)
5334         return FALSE;
5335
5336     if (src_bpp == 16)
5337     {
5338         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5339         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5340         src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5341         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5342         byte_width = 2 * width;
5343         src_stride *= 2;
5344         dst_stride *= 2;
5345     }
5346     else if (src_bpp == 32)
5347     {
5348         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5349         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5350         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5351         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5352         byte_width = 4 * width;
5353         src_stride *= 4;
5354         dst_stride *= 4;
5355     }
5356     else
5357     {
5358         return FALSE;
5359     }
5360
5361     cache_prefetch ((__m128i*)src_bytes);
5362     cache_prefetch ((__m128i*)dst_bytes);
5363
5364     while (height--)
5365     {
5366         int w;
5367         uint8_t *s = src_bytes;
5368         uint8_t *d = dst_bytes;
5369         src_bytes += src_stride;
5370         dst_bytes += dst_stride;
5371         w = byte_width;
5372
5373         cache_prefetch_next ((__m128i*)s);
5374         cache_prefetch_next ((__m128i*)d);
5375
5376         while (w >= 2 && ((unsigned long)d & 3))
5377         {
5378             *(uint16_t *)d = *(uint16_t *)s;
5379             w -= 2;
5380             s += 2;
5381             d += 2;
5382         }
5383
5384         while (w >= 4 && ((unsigned long)d & 15))
5385         {
5386             *(uint32_t *)d = *(uint32_t *)s;
5387
5388             w -= 4;
5389             s += 4;
5390             d += 4;
5391         }
5392
5393         cache_prefetch_next ((__m128i*)s);
5394         cache_prefetch_next ((__m128i*)d);
5395
5396         while (w >= 64)
5397         {
5398             __m128i xmm0, xmm1, xmm2, xmm3;
5399
5400             /* 128 bytes ahead */
5401             cache_prefetch (((__m128i*)s) + 8);
5402             cache_prefetch (((__m128i*)d) + 8);
5403
5404             xmm0 = load_128_unaligned ((__m128i*)(s));
5405             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5406             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5407             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5408
5409             save_128_aligned ((__m128i*)(d),    xmm0);
5410             save_128_aligned ((__m128i*)(d + 16), xmm1);
5411             save_128_aligned ((__m128i*)(d + 32), xmm2);
5412             save_128_aligned ((__m128i*)(d + 48), xmm3);
5413
5414             s += 64;
5415             d += 64;
5416             w -= 64;
5417         }
5418
5419         cache_prefetch_next ((__m128i*)s);
5420         cache_prefetch_next ((__m128i*)d);
5421
5422         while (w >= 16)
5423         {
5424             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5425
5426             w -= 16;
5427             d += 16;
5428             s += 16;
5429         }
5430
5431         cache_prefetch_next ((__m128i*)s);
5432         cache_prefetch_next ((__m128i*)d);
5433
5434         while (w >= 4)
5435         {
5436             *(uint32_t *)d = *(uint32_t *)s;
5437
5438             w -= 4;
5439             s += 4;
5440             d += 4;
5441         }
5442
5443         if (w >= 2)
5444         {
5445             *(uint16_t *)d = *(uint16_t *)s;
5446             w -= 2;
5447             s += 2;
5448             d += 2;
5449         }
5450     }
5451
5452     _mm_empty ();
5453
5454     return TRUE;
5455 }
5456
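/*
 * Thin wrapper that feeds the image geometry to pixman_blt_sse2 ().  The
 * return value is not checked because the fast-path table below only
 * routes same-format 16 and 32 bpp blits here.
 */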
5457 static void
5458 sse2_composite_copy_area (pixman_implementation_t *imp,
5459                           pixman_op_t              op,
5460                           pixman_image_t *         src_image,
5461                           pixman_image_t *         mask_image,
5462                           pixman_image_t *         dst_image,
5463                           int32_t                  src_x,
5464                           int32_t                  src_y,
5465                           int32_t                  mask_x,
5466                           int32_t                  mask_y,
5467                           int32_t                  dest_x,
5468                           int32_t                  dest_y,
5469                           int32_t                  width,
5470                           int32_t                  height)
5471 {
5472     pixman_blt_sse2 (src_image->bits.bits,
5473                      dst_image->bits.bits,
5474                      src_image->bits.rowstride,
5475                      dst_image->bits.rowstride,
5476                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5477                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5478                      src_x, src_y, dest_x, dest_y, width, height);
5479 }
5480
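/*
 * OVER with an x8r8g8b8 source and an a8 mask.  The source alpha is
 * forced to 0xff, so when four consecutive mask bytes are all 0xff the
 * four source pixels can be stored directly; otherwise the mask bytes
 * are expanded to per-channel alphas and combined with in_over.
 */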
5481 static void
5482 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5483                                  pixman_op_t              op,
5484                                  pixman_image_t *         src_image,
5485                                  pixman_image_t *         mask_image,
5486                                  pixman_image_t *         dst_image,
5487                                  int32_t                  src_x,
5488                                  int32_t                  src_y,
5489                                  int32_t                  mask_x,
5490                                  int32_t                  mask_y,
5491                                  int32_t                  dest_x,
5492                                  int32_t                  dest_y,
5493                                  int32_t                  width,
5494                                  int32_t                  height)
5495 {
5496     uint32_t    *src, *src_line, s;
5497     uint32_t    *dst, *dst_line, d;
5498     uint8_t         *mask, *mask_line;
5499     uint32_t m;
5500     int src_stride, mask_stride, dst_stride;
5501     int32_t w;
5502     __m64 ms;
5503
5504     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5505     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5506     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5507
5508     PIXMAN_IMAGE_GET_LINE (
5509         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5510     PIXMAN_IMAGE_GET_LINE (
5511         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5512     PIXMAN_IMAGE_GET_LINE (
5513         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5514
5515     while (height--)
5516     {
5517         src = src_line;
5518         src_line += src_stride;
5519         dst = dst_line;
5520         dst_line += dst_stride;
5521         mask = mask_line;
5522         mask_line += mask_stride;
5523
5524         w = width;
5525
5526         /* call prefetch hint to optimize cache load*/
5527         cache_prefetch ((__m128i*)src);
5528         cache_prefetch ((__m128i*)dst);
5529         cache_prefetch ((__m128i*)mask);
5530
5531         while (w && (unsigned long)dst & 15)
5532         {
5533             s = 0xff000000 | *src++;
5534             m = (uint32_t) *mask++;
5535             d = *dst;
5536             ms = unpack_32_1x64 (s);
5537
5538             if (m != 0xff)
5539             {
5540                 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5541                 __m64 md = unpack_32_1x64 (d);
5542
5543                 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5544             }
5545
5546             *dst++ = pack_1x64_32 (ms);
5547             w--;
5548         }
5549
5550         /* call prefetch hint to optimize cache load*/
5551         cache_prefetch ((__m128i*)src);
5552         cache_prefetch ((__m128i*)dst);
5553         cache_prefetch ((__m128i*)mask);
5554
5555         while (w >= 4)
5556         {
5557             /* fill cache line with next memory */
5558             cache_prefetch_next ((__m128i*)src);
5559             cache_prefetch_next ((__m128i*)dst);
5560             cache_prefetch_next ((__m128i*)mask);
5561
5562             m = *(uint32_t*) mask;
5563             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5564
5565             if (m == 0xffffffff)
5566             {
5567                 save_128_aligned ((__m128i*)dst, xmm_src);
5568             }
5569             else
5570             {
5571                 xmm_dst = load_128_aligned ((__m128i*)dst);
5572
5573                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5574
5575                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5576                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5577                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5578
5579                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5580
5581                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5582
5583                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5584             }
5585
5586             src += 4;
5587             dst += 4;
5588             mask += 4;
5589             w -= 4;
5590         }
5591
5592         while (w)
5593         {
5594             m = (uint32_t) *mask++;
5595
5596             if (m)
5597             {
5598                 s = 0xff000000 | *src;
5599
5600                 if (m == 0xff)
5601                 {
5602                     *dst = s;
5603                 }
5604                 else
5605                 {
5606                     __m64 ma, md, ms;
5607
5608                     d = *dst;
5609
5610                     ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5611                     md = unpack_32_1x64 (d);
5612                     ms = unpack_32_1x64 (s);
5613
5614                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5615                 }
5616
5617             }
5618
5619             src++;
5620             dst++;
5621             w--;
5622         }
5623     }
5624
5625     _mm_empty ();
5626 }
5627
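/*
 * OVER with an a8r8g8b8 source and an a8 mask.  Per group of four pixels
 * the loop skips all work when the mask word is zero, stores the source
 * directly when the mask is 0xffffffff and the source block is opaque,
 * and otherwise applies in_over with the source alpha.
 */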
5628 static void
5629 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5630                                  pixman_op_t              op,
5631                                  pixman_image_t *         src_image,
5632                                  pixman_image_t *         mask_image,
5633                                  pixman_image_t *         dst_image,
5634                                  int32_t                  src_x,
5635                                  int32_t                  src_y,
5636                                  int32_t                  mask_x,
5637                                  int32_t                  mask_y,
5638                                  int32_t                  dest_x,
5639                                  int32_t                  dest_y,
5640                                  int32_t                  width,
5641                                  int32_t                  height)
5642 {
5643     uint32_t    *src, *src_line, s;
5644     uint32_t    *dst, *dst_line, d;
5645     uint8_t         *mask, *mask_line;
5646     uint32_t m;
5647     int src_stride, mask_stride, dst_stride;
5648     int32_t w;
5649
5650     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5651     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5652     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5653
5654     PIXMAN_IMAGE_GET_LINE (
5655         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5656     PIXMAN_IMAGE_GET_LINE (
5657         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5658     PIXMAN_IMAGE_GET_LINE (
5659         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5660
5661     while (height--)
5662     {
5663         src = src_line;
5664         src_line += src_stride;
5665         dst = dst_line;
5666         dst_line += dst_stride;
5667         mask = mask_line;
5668         mask_line += mask_stride;
5669
5670         w = width;
5671
5672         /* call prefetch hint to optimize cache load*/
5673         cache_prefetch ((__m128i *)src);
5674         cache_prefetch ((__m128i *)dst);
5675         cache_prefetch ((__m128i *)mask);
5676
5677         while (w && (unsigned long)dst & 15)
5678         {
5679             uint32_t sa;
5680
5681             s = *src++;
5682             m = (uint32_t) *mask++;
5683             d = *dst;
5684
5685             sa = s >> 24;
5686
5687             if (m)
5688             {
5689                 if (sa == 0xff && m == 0xff)
5690                 {
5691                     *dst = s;
5692                 }
5693                 else
5694                 {
5695                     __m64 ms, md, ma, msa;
5696
5697                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5698                     ms = unpack_32_1x64 (s);
5699                     md = unpack_32_1x64 (d);
5700
5701                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5702
5703                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5704                 }
5705             }
5706
5707             dst++;
5708             w--;
5709         }
5710
5711         /* call prefetch hint to optimize cache load*/
5712         cache_prefetch ((__m128i *)src);
5713         cache_prefetch ((__m128i *)dst);
5714         cache_prefetch ((__m128i *)mask);
5715
5716         while (w >= 4)
5717         {
5718             /* fill cache line with next memory */
5719             cache_prefetch_next ((__m128i *)src);
5720             cache_prefetch_next ((__m128i *)dst);
5721             cache_prefetch_next ((__m128i *)mask);
5722
5723             m = *(uint32_t *) mask;
5724
5725             if (m)
5726             {
5727                 xmm_src = load_128_unaligned ((__m128i*)src);
5728
5729                 if (m == 0xffffffff && is_opaque (xmm_src))
5730                 {
5731                     save_128_aligned ((__m128i *)dst, xmm_src);
5732                 }
5733                 else
5734                 {
5735                     xmm_dst = load_128_aligned ((__m128i *)dst);
5736
5737                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5738
5739                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5740                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5741                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5742
5743                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5744                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5745
5746                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5747                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5748
5749                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5750                 }
5751             }
5752
5753             src += 4;
5754             dst += 4;
5755             mask += 4;
5756             w -= 4;
5757         }
5758
5759         while (w)
5760         {
5761             uint32_t sa;
5762
5763             s = *src++;
5764             m = (uint32_t) *mask++;
5765             d = *dst;
5766
5767             sa = s >> 24;
5768
5769             if (m)
5770             {
5771                 if (sa == 0xff && m == 0xff)
5772                 {
5773                     *dst = s;
5774                 }
5775                 else
5776                 {
5777                     __m64 ms, md, ma, msa;
5778
5779                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5780                     ms = unpack_32_1x64 (s);
5781                     md = unpack_32_1x64 (d);
5782
5783                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5784
5785                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5786                 }
5787             }
5788
5789             dst++;
5790             w--;
5791         }
5792     }
5793
5794     _mm_empty ();
5795 }
5796
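/*
 * OVER_REVERSE with a solid source: the destination stays on top and the
 * solid colour only shows through where the destination is not opaque,
 * i.e. roughly dst = dst + src * (1 - dst.alpha).  The solid colour is
 * expanded once outside the loops and reused for every pixel.
 */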
5797 static void
5798 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5799                                     pixman_op_t              op,
5800                                     pixman_image_t *         src_image,
5801                                     pixman_image_t *         mask_image,
5802                                     pixman_image_t *         dst_image,
5803                                     int32_t                  src_x,
5804                                     int32_t                  src_y,
5805                                     int32_t                  mask_x,
5806                                     int32_t                  mask_y,
5807                                     int32_t                  dest_x,
5808                                     int32_t                  dest_y,
5809                                     int32_t                  width,
5810                                     int32_t                  height)
5811 {
5812     uint32_t src;
5813     uint32_t    *dst_line, *dst;
5814     __m128i xmm_src;
5815     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5816     __m128i xmm_dsta_hi, xmm_dsta_lo;
5817     int dst_stride;
5818     int32_t w;
5819
5820     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5821
5822     if (src == 0)
5823         return;
5824
5825     PIXMAN_IMAGE_GET_LINE (
5826         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5827
5828     xmm_src = expand_pixel_32_1x128 (src);
5829
5830     while (height--)
5831     {
5832         dst = dst_line;
5833
5834         /* call prefetch hint to optimize cache load*/
5835         cache_prefetch ((__m128i*)dst);
5836
5837         dst_line += dst_stride;
5838         w = width;
5839
5840         while (w && (unsigned long)dst & 15)
5841         {
5842             __m64 vd;
5843
5844             vd = unpack_32_1x64 (*dst);
5845
5846             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5847                                             _mm_movepi64_pi64 (xmm_src)));
5848             w--;
5849             dst++;
5850         }
5851
5852         cache_prefetch ((__m128i*)dst);
5853
5854         while (w >= 4)
5855         {
5856             __m128i tmp_lo, tmp_hi;
5857
5858             /* fill cache line with next memory */
5859             cache_prefetch_next ((__m128i*)(dst + 4));
5860
5861             xmm_dst = load_128_aligned ((__m128i*)dst);
5862
5863             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5864             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5865
5866             tmp_lo = xmm_src;
5867             tmp_hi = xmm_src;
5868
5869             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5870                         &xmm_dsta_lo, &xmm_dsta_hi,
5871                         &tmp_lo, &tmp_hi);
5872
5873             save_128_aligned (
5874                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5875
5876             w -= 4;
5877             dst += 4;
5878         }
5879
5880         while (w)
5881         {
5882             __m64 vd;
5883
5884             vd = unpack_32_1x64 (*dst);
5885
5886             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5887                                             _mm_movepi64_pi64 (xmm_src)));
5888             w--;
5889             dst++;
5890         }
5891
5892     }
5893
5894     _mm_empty ();
5895 }
5896
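/*
 * OVER with an a8r8g8b8 source and an a8r8g8b8 mask used in unified-alpha
 * mode: only the alpha byte of each mask pixel (m >> 24) is applied to
 * all channels.  The vector path loads four mask pixels, skips the group
 * when it is fully transparent and stores the source directly when both
 * mask and source are fully opaque.
 */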
5897 static void
5898 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5899                                     pixman_op_t              op,
5900                                     pixman_image_t *         src_image,
5901                                     pixman_image_t *         mask_image,
5902                                     pixman_image_t *         dst_image,
5903                                     int32_t                  src_x,
5904                                     int32_t                  src_y,
5905                                     int32_t                  mask_x,
5906                                     int32_t                  mask_y,
5907                                     int32_t                  dest_x,
5908                                     int32_t                  dest_y,
5909                                     int32_t                  width,
5910                                     int32_t                  height)
5911 {
5912     uint32_t    *src, *src_line, s;
5913     uint32_t    *dst, *dst_line, d;
5914     uint32_t    *mask, *mask_line;
5915     uint32_t    m;
5916     int src_stride, mask_stride, dst_stride;
5917     int32_t w;
5918
5919     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5920     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5921     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5922
5923     PIXMAN_IMAGE_GET_LINE (
5924         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5925     PIXMAN_IMAGE_GET_LINE (
5926         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5927     PIXMAN_IMAGE_GET_LINE (
5928         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5929
5930     while (height--)
5931     {
5932         src = src_line;
5933         src_line += src_stride;
5934         dst = dst_line;
5935         dst_line += dst_stride;
5936         mask = mask_line;
5937         mask_line += mask_stride;
5938
5939         w = width;
5940
5941         /* call prefetch hint to optimize cache load*/
5942         cache_prefetch ((__m128i *)src);
5943         cache_prefetch ((__m128i *)dst);
5944         cache_prefetch ((__m128i *)mask);
5945
5946         while (w && (unsigned long)dst & 15)
5947         {
5948             uint32_t sa;
5949
5950             s = *src++;
5951             m = (*mask++) >> 24;
5952             d = *dst;
5953
5954             sa = s >> 24;
5955
5956             if (m)
5957             {
5958                 if (sa == 0xff && m == 0xff)
5959                 {
5960                     *dst = s;
5961                 }
5962                 else
5963                 {
5964                     __m64 ms, md, ma, msa;
5965
5966                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5967                     ms = unpack_32_1x64 (s);
5968                     md = unpack_32_1x64 (d);
5969
5970                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5971
5972                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5973                 }
5974             }
5975
5976             dst++;
5977             w--;
5978         }
5979
5980         /* call prefetch hint to optimize cache load*/
5981         cache_prefetch ((__m128i *)src);
5982         cache_prefetch ((__m128i *)dst);
5983         cache_prefetch ((__m128i *)mask);
5984
5985         while (w >= 4)
5986         {
5987             /* fill cache line with next memory */
5988             cache_prefetch_next ((__m128i *)src);
5989             cache_prefetch_next ((__m128i *)dst);
5990             cache_prefetch_next ((__m128i *)mask);
5991
5992             xmm_mask = load_128_unaligned ((__m128i*)mask);
5993
5994             if (!is_transparent (xmm_mask))
5995             {
5996                 xmm_src = load_128_unaligned ((__m128i*)src);
5997
5998                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5999                 {
6000                     save_128_aligned ((__m128i *)dst, xmm_src);
6001                 }
6002                 else
6003                 {
6004                     xmm_dst = load_128_aligned ((__m128i *)dst);
6005
6006                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6007                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
6008                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6009
6010                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
6011                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
6012
6013                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
6014                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
6015
6016                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6017                 }
6018             }
6019
6020             src += 4;
6021             dst += 4;
6022             mask += 4;
6023             w -= 4;
6024         }
6025
6026         while (w)
6027         {
6028             uint32_t sa;
6029
6030             s = *src++;
6031             m = (*mask++) >> 24;
6032             d = *dst;
6033
6034             sa = s >> 24;
6035
6036             if (m)
6037             {
6038                 if (sa == 0xff && m == 0xff)
6039                 {
6040                     *dst = s;
6041                 }
6042                 else
6043                 {
6044                     __m64 ms, md, ma, msa;
6045
6046                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
6047                     ms = unpack_32_1x64 (s);
6048                     md = unpack_32_1x64 (d);
6049
6050                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
6051
6052                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
6053                 }
6054             }
6055
6056             dst++;
6057             w--;
6058         }
6059     }
6060
6061     _mm_empty ();
6062 }
6063
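/*
 * Fast-path table: each entry matches an (operator, source format, mask
 * format, destination format) tuple and names the routine that handles
 * it.  "solid" matches solid-colour sources, "null" means no mask, and
 * the _CA macro marks component-alpha mask entries.  Lookup walks the
 * table in order, so more specialised entries should appear before more
 * general ones.
 */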
6064 static const pixman_fast_path_t sse2_fast_paths[] =
6065 {
6066     /* PIXMAN_OP_OVER */
6067     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6068     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6069     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6070     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6071     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6072     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6073     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6074     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6075     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6076     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6077     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6078     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6079     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6080     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6081     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6082     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6083     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6084     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6085     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6086     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6087     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6088     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6089     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6090     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6091     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6092     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6093     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6094     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6095     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6096     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6097     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6098     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6099     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6100     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6101     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6102     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6103     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6104     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6105     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6106     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6107     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6108     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6109     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6110     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6111     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6112     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6113
6114     /* PIXMAN_OP_OVER_REVERSE */
6115     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6116     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6117
6118     /* PIXMAN_OP_ADD */
6119     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6120     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
6121     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6122     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6123     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6124
6125     /* PIXMAN_OP_SRC */
6126     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6127     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6128     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6129     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6130     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6131     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6132     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6133     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6134     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6135     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6136     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6137     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6138
6139     /* PIXMAN_OP_IN */
6140     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6141     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6142
6143     { PIXMAN_OP_NONE },
6144 };
6145
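/*
 * Implementation hook for blits: try the SSE2 copy first and defer to the
 * delegate implementation when pixman_blt_sse2 () cannot handle the
 * request (for example when source and destination depths differ).
 */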
6146 static pixman_bool_t
6147 sse2_blt (pixman_implementation_t *imp,
6148           uint32_t *               src_bits,
6149           uint32_t *               dst_bits,
6150           int                      src_stride,
6151           int                      dst_stride,
6152           int                      src_bpp,
6153           int                      dst_bpp,
6154           int                      src_x,
6155           int                      src_y,
6156           int                      dst_x,
6157           int                      dst_y,
6158           int                      width,
6159           int                      height)
6160 {
6161     if (!pixman_blt_sse2 (
6162             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6163             src_x, src_y, dst_x, dst_y, width, height))
6164
6165     {
6166         return _pixman_implementation_blt (
6167             imp->delegate,
6168             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6169             src_x, src_y, dst_x, dst_y, width, height);
6170     }
6171
6172     return TRUE;
6173 }
6174
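/*
 * Implementation hook for solid fills: pixman_fill_sse2 () handles the
 * supported depths (with this patch, 8 bpp as well as 16 and 32 bpp) and
 * the delegate is used for everything else.  The force_align_arg_pointer
 * attribute below keeps the stack 16-byte aligned on 32-bit GCC targets
 * so that aligned SSE2 spills inside the fill code are safe.
 */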
6175 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6176 __attribute__((__force_align_arg_pointer__))
6177 #endif
6178 static pixman_bool_t
6179 sse2_fill (pixman_implementation_t *imp,
6180            uint32_t *               bits,
6181            int                      stride,
6182            int                      bpp,
6183            int                      x,
6184            int                      y,
6185            int                      width,
6186            int                      height,
6187            uint32_t xor)
6188 {
6189     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
6190     {
6191         return _pixman_implementation_fill (
6192             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
6193     }
6194
6195     return TRUE;
6196 }
6197
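/*
 * Creates the SSE2 implementation.  It stacks on top of the MMX
 * implementation when that is available (otherwise the generic fast-path
 * one), fills in the 64-bit and 128-bit constant masks used by the
 * routines above, and installs the SSE2 combiners plus the blt and fill
 * entry points.
 */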
6198 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6199 __attribute__((__force_align_arg_pointer__))
6200 #endif
6201 pixman_implementation_t *
6202 _pixman_implementation_create_sse2 (void)
6203 {
6204 #ifdef USE_MMX
6205     pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
6206 #else
6207     pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
6208 #endif
6209     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6210
6211     /* SSE2 constants */
6212     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6213     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6214     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6215     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6216     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6217     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6218     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6219     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6220     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
6221     mask_0080 = create_mask_16_128 (0x0080);
6222     mask_00ff = create_mask_16_128 (0x00ff);
6223     mask_0101 = create_mask_16_128 (0x0101);
6224     mask_ffff = create_mask_16_128 (0xffff);
6225     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6226     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6227
6228     /* MMX constants */
6229     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
6230     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
6231
6232     mask_x0080 = create_mask_16_64 (0x0080);
6233     mask_x00ff = create_mask_16_64 (0x00ff);
6234     mask_x0101 = create_mask_16_64 (0x0101);
6235     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
6236
6237     _mm_empty ();
6238
6239     /* Set up function pointers */
6240
6241     /* Replace the generic combiners with the SSE2 versions */
6242     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6243     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6244     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6245     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6246     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6247     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6248     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6249     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6250     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6251     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6252
6253     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6254
6255     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6256     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6257     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6258     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6259     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6260     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6261     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6262     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6263     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6264     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6265     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6266
6267     imp->blt = sse2_blt;
6268     imp->fill = sse2_fill;
6269
6270     return imp;
6271 }
6272
6273 #endif /* USE_SSE2 */