SSE2 optimization for scaled over_8888_8888 operation with nearest filter
pixman/pixman-sse2.c
1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <mmintrin.h>
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38 #include "pixman-fast-path.h"
39
40 #if defined(_MSC_VER) && defined(_M_AMD64)
41 /* Windows 64 doesn't allow MMX to be used, so
42  * the pixman-x64-mmx-emulation.h file contains
43  * implementations of those MMX intrinsics that
44  * are used in the SSE2 implementation.
45  */
46 #   include "pixman-x64-mmx-emulation.h"
47 #endif
48
49 #ifdef USE_SSE2
50
51 /* --------------------------------------------------------------------
52  * Locals
53  */
54
55 static __m64 mask_x0080;
56 static __m64 mask_x00ff;
57 static __m64 mask_x0101;
58 static __m64 mask_x_alpha;
59
60 static __m64 mask_x565_rgb;
61 static __m64 mask_x565_unpack;
62
63 static __m128i mask_0080;
64 static __m128i mask_00ff;
65 static __m128i mask_0101;
66 static __m128i mask_ffff;
67 static __m128i mask_ff000000;
68 static __m128i mask_alpha;
69
70 static __m128i mask_565_r;
71 static __m128i mask_565_g1, mask_565_g2;
72 static __m128i mask_565_b;
73 static __m128i mask_red;
74 static __m128i mask_green;
75 static __m128i mask_blue;
76
77 static __m128i mask_565_fix_rb;
78 static __m128i mask_565_fix_g;
79
80 /* ----------------------------------------------------------------------
81  * SSE2 Inlines
82  */
83 static force_inline __m128i
84 unpack_32_1x128 (uint32_t data)
85 {
86     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
87 }
88
89 static force_inline void
90 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
91 {
92     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
93     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
94 }
95
96 static force_inline __m128i
97 unpack_565_to_8888 (__m128i lo)
98 {
99     __m128i r, g, b, rb, t;
100
101     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
102     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
103     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
104
105     rb = _mm_or_si128 (r, b);
106     t  = _mm_and_si128 (rb, mask_565_fix_rb);
107     t  = _mm_srli_epi32 (t, 5);
108     rb = _mm_or_si128 (rb, t);
109
110     t  = _mm_and_si128 (g, mask_565_fix_g);
111     t  = _mm_srli_epi32 (t, 6);
112     g  = _mm_or_si128 (g, t);
113
114     return _mm_or_si128 (rb, g);
115 }
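/*
 * Illustrative scalar sketch of the conversion above (assuming the mask_*
 * constants select the red, green and blue fields in the usual way, and
 * ignoring the alpha byte): each 565 channel is widened to 8 bits by
 * replicating its top bits into the newly created low bits.
 */
static force_inline uint32_t
expand_565_to_8888_scalar (uint16_t p)
{
    uint32_t r5 = (p >> 11) & 0x1f;
    uint32_t g6 = (p >> 5)  & 0x3f;
    uint32_t b5 =  p        & 0x1f;

    /* replicate the top bits so that 0x1f -> 0xff and 0x00 -> 0x00 */
    uint32_t r8 = (r5 << 3) | (r5 >> 2);
    uint32_t g8 = (g6 << 2) | (g6 >> 4);
    uint32_t b8 = (b5 << 3) | (b5 >> 2);

    return (r8 << 16) | (g8 << 8) | b8;
}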
116
117 static force_inline void
118 unpack_565_128_4x128 (__m128i  data,
119                       __m128i* data0,
120                       __m128i* data1,
121                       __m128i* data2,
122                       __m128i* data3)
123 {
124     __m128i lo, hi;
125
126     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
127     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
128
129     lo = unpack_565_to_8888 (lo);
130     hi = unpack_565_to_8888 (hi);
131
132     unpack_128_2x128 (lo, data0, data1);
133     unpack_128_2x128 (hi, data2, data3);
134 }
135
136 static force_inline uint16_t
137 pack_565_32_16 (uint32_t pixel)
138 {
139     return (uint16_t) (((pixel >> 8) & 0xf800) |
140                        ((pixel >> 5) & 0x07e0) |
141                        ((pixel >> 3) & 0x001f));
142 }
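/*
 * Worked example for the packing above: 0x00ff8040 (r = 0xff, g = 0x80,
 * b = 0x40) becomes 0xf800 | 0x0400 | 0x0008 = 0xfc08, i.e. each channel
 * is truncated to its top 5 (red/blue) or 6 (green) bits.
 */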
143
144 static force_inline __m128i
145 pack_2x128_128 (__m128i lo, __m128i hi)
146 {
147     return _mm_packus_epi16 (lo, hi);
148 }
149
150 static force_inline __m128i
151 pack_565_2x128_128 (__m128i lo, __m128i hi)
152 {
153     __m128i data;
154     __m128i r, g1, g2, b;
155
156     data = pack_2x128_128 (lo, hi);
157
158     r  = _mm_and_si128 (data, mask_565_r);
159     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
160     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
161     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
162
163     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
164 }
165
166 static force_inline __m128i
167 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
168 {
169     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
170                              pack_565_2x128_128 (*xmm2, *xmm3));
171 }
172
173 static force_inline int
174 is_opaque (__m128i x)
175 {
176     __m128i ffs = _mm_cmpeq_epi8 (x, x);
177
178     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
179 }
180
181 static force_inline int
182 is_zero (__m128i x)
183 {
184     return _mm_movemask_epi8 (
185         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
186 }
187
188 static force_inline int
189 is_transparent (__m128i x)
190 {
191     return (_mm_movemask_epi8 (
192                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
193 }
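/*
 * Note on the three predicates above: _mm_movemask_epi8 produces one bit
 * per byte, and for four packed ARGB pixels the alpha bytes occupy byte
 * positions 3, 7, 11 and 15.  Masking with 0x8888 therefore tests only
 * the four alpha bytes: all 0xff means the block is opaque, all 0x00
 * means it is transparent.
 */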
194
195 static force_inline __m128i
196 expand_pixel_32_1x128 (uint32_t data)
197 {
198     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
199 }
200
201 static force_inline __m128i
202 expand_alpha_1x128 (__m128i data)
203 {
204     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
205                                                      _MM_SHUFFLE (3, 3, 3, 3)),
206                                 _MM_SHUFFLE (3, 3, 3, 3));
207 }
208
209 static force_inline void
210 expand_alpha_2x128 (__m128i  data_lo,
211                     __m128i  data_hi,
212                     __m128i* alpha_lo,
213                     __m128i* alpha_hi)
214 {
215     __m128i lo, hi;
216
217     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
218     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
219
220     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
221     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
222 }
223
224 static force_inline void
225 expand_alpha_rev_2x128 (__m128i  data_lo,
226                         __m128i  data_hi,
227                         __m128i* alpha_lo,
228                         __m128i* alpha_hi)
229 {
230     __m128i lo, hi;
231
232     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
233     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
234     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
235     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
236 }
237
238 static force_inline void
239 pix_multiply_2x128 (__m128i* data_lo,
240                     __m128i* data_hi,
241                     __m128i* alpha_lo,
242                     __m128i* alpha_hi,
243                     __m128i* ret_lo,
244                     __m128i* ret_hi)
245 {
246     __m128i lo, hi;
247
248     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
249     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
250     lo = _mm_adds_epu16 (lo, mask_0080);
251     hi = _mm_adds_epu16 (hi, mask_0080);
252     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
253     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
254 }
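/*
 * A minimal scalar sketch of what pix_multiply_2x128 () does to each
 * 8-bit channel: multiply, add 0x80, then divide by 255 with rounding.
 * Multiplying by 0x0101 and keeping the high 16 bits (as done above with
 * _mm_mulhi_epu16) is the vector form of the two shifts below.
 */
static force_inline uint8_t
mul_un8_scalar (uint8_t x, uint8_t a)
{
    uint32_t t = (uint32_t)x * a + 0x80;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}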
255
256 static force_inline void
257 pix_add_multiply_2x128 (__m128i* src_lo,
258                         __m128i* src_hi,
259                         __m128i* alpha_dst_lo,
260                         __m128i* alpha_dst_hi,
261                         __m128i* dst_lo,
262                         __m128i* dst_hi,
263                         __m128i* alpha_src_lo,
264                         __m128i* alpha_src_hi,
265                         __m128i* ret_lo,
266                         __m128i* ret_hi)
267 {
268     __m128i t1_lo, t1_hi;
269     __m128i t2_lo, t2_hi;
270
271     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
272     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
273
274     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
275     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
276 }
277
278 static force_inline void
279 negate_2x128 (__m128i  data_lo,
280               __m128i  data_hi,
281               __m128i* neg_lo,
282               __m128i* neg_hi)
283 {
284     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
285     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
286 }
287
288 static force_inline void
289 invert_colors_2x128 (__m128i  data_lo,
290                      __m128i  data_hi,
291                      __m128i* inv_lo,
292                      __m128i* inv_hi)
293 {
294     __m128i lo, hi;
295
296     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
297     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
298     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
299     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
300 }
301
302 static force_inline void
303 over_2x128 (__m128i* src_lo,
304             __m128i* src_hi,
305             __m128i* alpha_lo,
306             __m128i* alpha_hi,
307             __m128i* dst_lo,
308             __m128i* dst_hi)
309 {
310     __m128i t1, t2;
311
312     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
313
314     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
315
316     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
317     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
318 }
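/*
 * Scalar sketch of the OVER step above for one (premultiplied) channel:
 * dst = src + dst * (255 - alpha) / 255, with a saturating final add just
 * like _mm_adds_epu8.
 */
static force_inline uint8_t
over_un8_scalar (uint8_t src, uint8_t alpha, uint8_t dst)
{
    uint32_t t = (uint32_t)dst * (255 - alpha) + 0x80;

    t = (t + (t >> 8)) >> 8;
    t += src;

    return t > 0xff ? 0xff : (uint8_t)t;
}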
319
320 static force_inline void
321 over_rev_non_pre_2x128 (__m128i  src_lo,
322                         __m128i  src_hi,
323                         __m128i* dst_lo,
324                         __m128i* dst_hi)
325 {
326     __m128i lo, hi;
327     __m128i alpha_lo, alpha_hi;
328
329     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
330
331     lo = _mm_or_si128 (alpha_lo, mask_alpha);
332     hi = _mm_or_si128 (alpha_hi, mask_alpha);
333
334     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
335
336     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
337
338     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
339 }
340
341 static force_inline void
342 in_over_2x128 (__m128i* src_lo,
343                __m128i* src_hi,
344                __m128i* alpha_lo,
345                __m128i* alpha_hi,
346                __m128i* mask_lo,
347                __m128i* mask_hi,
348                __m128i* dst_lo,
349                __m128i* dst_hi)
350 {
351     __m128i s_lo, s_hi;
352     __m128i a_lo, a_hi;
353
354     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
355     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
356
357     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
358 }
359
360 static force_inline void
361 cache_prefetch (__m128i* addr)
362 {
363     _mm_prefetch ((void const*)addr, _MM_HINT_T0);
364 }
365
366 static force_inline void
367 cache_prefetch_next (__m128i* addr)
368 {
369     _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
370 }
371
372 /* prefetching NULL is very slow on some systems. don't do that. */
373
374 static force_inline void
375 maybe_prefetch (__m128i* addr)
376 {
377     if (addr)
378         cache_prefetch (addr);
379 }
380
381 static force_inline void
382 maybe_prefetch_next (__m128i* addr)
383 {
384     if (addr)
385         cache_prefetch_next (addr);
386 }
387
388 /* load 4 pixels from a 16-byte boundary aligned address */
389 static force_inline __m128i
390 load_128_aligned (__m128i* src)
391 {
392     return _mm_load_si128 (src);
393 }
394
395 /* load 4 pixels from an unaligned address */
396 static force_inline __m128i
397 load_128_unaligned (const __m128i* src)
398 {
399     return _mm_loadu_si128 (src);
400 }
401
402 /* save 4 pixels using Write Combining memory on a 16-byte
403  * boundary aligned address
404  */
405 static force_inline void
406 save_128_write_combining (__m128i* dst,
407                           __m128i  data)
408 {
409     _mm_stream_si128 (dst, data);
410 }
411
412 /* save 4 pixels on a 16-byte boundary aligned address */
413 static force_inline void
414 save_128_aligned (__m128i* dst,
415                   __m128i  data)
416 {
417     _mm_store_si128 (dst, data);
418 }
419
420 /* save 4 pixels on an unaligned address */
421 static force_inline void
422 save_128_unaligned (__m128i* dst,
423                     __m128i  data)
424 {
425     _mm_storeu_si128 (dst, data);
426 }
427
428 /* ------------------------------------------------------------------
429  * MMX inlines
430  */
431
432 static force_inline __m64
433 load_32_1x64 (uint32_t data)
434 {
435     return _mm_cvtsi32_si64 (data);
436 }
437
438 static force_inline __m64
439 unpack_32_1x64 (uint32_t data)
440 {
441     return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
442 }
443
444 static force_inline __m64
445 expand_alpha_1x64 (__m64 data)
446 {
447     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
448 }
449
450 static force_inline __m64
451 expand_alpha_rev_1x64 (__m64 data)
452 {
453     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
454 }
455
456 static force_inline __m64
457 expand_pixel_8_1x64 (uint8_t data)
458 {
459     return _mm_shuffle_pi16 (
460         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
461 }
462
463 static force_inline __m64
464 pix_multiply_1x64 (__m64 data,
465                    __m64 alpha)
466 {
467     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
468                                           mask_x0080),
469                            mask_x0101);
470 }
471
472 static force_inline __m64
473 pix_add_multiply_1x64 (__m64* src,
474                        __m64* alpha_dst,
475                        __m64* dst,
476                        __m64* alpha_src)
477 {
478     __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
479     __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
480
481     return _mm_adds_pu8 (t1, t2);
482 }
483
484 static force_inline __m64
485 negate_1x64 (__m64 data)
486 {
487     return _mm_xor_si64 (data, mask_x00ff);
488 }
489
490 static force_inline __m64
491 invert_colors_1x64 (__m64 data)
492 {
493     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
494 }
495
496 static force_inline __m64
497 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
498 {
499     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
500 }
501
502 static force_inline __m64
503 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
504 {
505     return over_1x64 (pix_multiply_1x64 (*src, *mask),
506                       pix_multiply_1x64 (*alpha, *mask),
507                       *dst);
508 }
509
510 static force_inline __m64
511 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
512 {
513     __m64 alpha = expand_alpha_1x64 (src);
514
515     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
516                                          _mm_or_si64 (alpha, mask_x_alpha)),
517                       alpha,
518                       dst);
519 }
520
521 static force_inline uint32_t
522 pack_1x64_32 (__m64 data)
523 {
524     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
525 }
526
527 /* Expand a 16-bit 565 pixel into
528  *
529  *    00RR00GG00BB
530  *
531  * --- Expanding 565 in the low word ---
532  *
533  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
534  * m = m & (01f0003f001f);
535  * m = m * (008404100840);
536  * m = m >> 8;
537  *
538  * Note the trick here - the top word is shifted by another nibble to
539  * avoid it bumping into the middle word
540  */
541 static force_inline __m64
542 expand565_16_1x64 (uint16_t pixel)
543 {
544     __m64 p;
545     __m64 t1, t2;
546
547     p = _mm_cvtsi32_si64 ((uint32_t) pixel);
548
549     t1 = _mm_slli_si64 (p, 36 - 11);
550     t2 = _mm_slli_si64 (p, 16 - 5);
551
552     p = _mm_or_si64 (t1, p);
553     p = _mm_or_si64 (t2, p);
554     p = _mm_and_si64 (p, mask_x565_rgb);
555     p = _mm_mullo_pi16 (p, mask_x565_unpack);
556
557     return _mm_srli_pi16 (p, 8);
558 }
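/*
 * Worked example of the multiply trick above, assuming the constants
 * quoted in the comment (mask 0x01f0003f001f, multiplier 0x008404100840):
 * for the blue channel the 5-bit value b sits in the low word, and
 *
 *     (b * 0x0840) >> 8  ==  (b << 3) | (b >> 2)
 *
 * so the multiply-and-shift replicates the top bits into the low bits,
 * matching the replication done in unpack_565_to_8888 () above.
 */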
559
560 /* ----------------------------------------------------------------------------
561  * Compose Core transformations
562  */
563 static force_inline uint32_t
564 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
565 {
566     uint8_t a;
567     __m64 ms;
568
569     a = src >> 24;
570
571     if (a == 0xff)
572     {
573         return src;
574     }
575     else if (src)
576     {
577         ms = unpack_32_1x64 (src);
578         return pack_1x64_32 (
579             over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
580     }
581
582     return dst;
583 }
584
585 static force_inline uint32_t
586 combine1 (const uint32_t *ps, const uint32_t *pm)
587 {
588     uint32_t s = *ps;
589
590     if (pm)
591     {
592         __m64 ms, mm;
593
594         mm = unpack_32_1x64 (*pm);
595         mm = expand_alpha_1x64 (mm);
596
597         ms = unpack_32_1x64 (s);
598         ms = pix_multiply_1x64 (ms, mm);
599
600         s = pack_1x64_32 (ms);
601     }
602
603     return s;
604 }
605
606 static force_inline __m128i
607 combine4 (const __m128i *ps, const __m128i *pm)
608 {
609     __m128i xmm_src_lo, xmm_src_hi;
610     __m128i xmm_msk_lo, xmm_msk_hi;
611     __m128i s;
612
613     if (pm)
614     {
615         xmm_msk_lo = load_128_unaligned (pm);
616
617         if (is_transparent (xmm_msk_lo))
618             return _mm_setzero_si128 ();
619     }
620
621     s = load_128_unaligned (ps);
622
623     if (pm)
624     {
625         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
626         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
627
628         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
629
630         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
631                             &xmm_msk_lo, &xmm_msk_hi,
632                             &xmm_src_lo, &xmm_src_hi);
633
634         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
635     }
636
637     return s;
638 }
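/*
 * combine1 () and combine4 () implement the mask step shared by the *_u
 * combiners below: when a mask pointer is supplied, the source is
 * multiplied by the expanded alpha of the mask before the operator runs;
 * with a NULL mask the source is passed through untouched.  The *_ca
 * variants further down use the mask per component instead.
 */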
639
640 static force_inline void
641 core_combine_over_u_sse2 (uint32_t*       pd,
642                           const uint32_t* ps,
643                           const uint32_t* pm,
644                           int             w)
645 {
646     uint32_t s, d;
647
648     __m128i xmm_dst_lo, xmm_dst_hi;
649     __m128i xmm_src_lo, xmm_src_hi;
650     __m128i xmm_alpha_lo, xmm_alpha_hi;
651
652     /* call prefetch hint to optimize cache load*/
653     cache_prefetch ((__m128i*)ps);
654     cache_prefetch ((__m128i*)pd);
655     maybe_prefetch ((__m128i*)pm);
656
657     /* Align dst on a 16-byte boundary */
658     while (w && ((unsigned long)pd & 15))
659     {
660         d = *pd;
661         s = combine1 (ps, pm);
662
663         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
664         ps++;
665         if (pm)
666             pm++;
667         w--;
668     }
669
670     /* call prefetch hint to optimize cache load*/
671     cache_prefetch ((__m128i*)ps);
672     cache_prefetch ((__m128i*)pd);
673     maybe_prefetch ((__m128i*)pm);
674
675     while (w >= 4)
676     {
677         /* fill cache line with next memory */
678         cache_prefetch_next ((__m128i*)ps);
679         cache_prefetch_next ((__m128i*)pd);
680         maybe_prefetch_next ((__m128i*)pm);
681
682         /* I'm loading unaligned because I'm not sure about
683          * the address alignment.
684          */
685         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
686
687         if (is_opaque (xmm_src_hi))
688         {
689             save_128_aligned ((__m128i*)pd, xmm_src_hi);
690         }
691         else if (!is_zero (xmm_src_hi))
692         {
693             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
694
695             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
696             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
697
698             expand_alpha_2x128 (
699                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
700
701             over_2x128 (&xmm_src_lo, &xmm_src_hi,
702                         &xmm_alpha_lo, &xmm_alpha_hi,
703                         &xmm_dst_lo, &xmm_dst_hi);
704
705             /* rebuild the 4 pixel data and save */
706             save_128_aligned ((__m128i*)pd,
707                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
708         }
709
710         w -= 4;
711         ps += 4;
712         pd += 4;
713         if (pm)
714             pm += 4;
715     }
716
717     while (w)
718     {
719         d = *pd;
720         s = combine1 (ps, pm);
721
722         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
723         ps++;
724         if (pm)
725             pm++;
726
727         w--;
728     }
729 }
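/*
 * The combiners below all follow the same three-phase loop seen above:
 * handle single pixels until pd is 16-byte aligned, process four pixels
 * per iteration with aligned 128-bit stores, then finish the remaining
 * (up to three) pixels one at a time.  The opaque/zero tests in the body
 * above additionally skip the blend entirely for fully opaque or fully
 * transparent source blocks.
 */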
730
731 static force_inline void
732 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
733                                   const uint32_t* ps,
734                                   const uint32_t* pm,
735                                   int             w)
736 {
737     uint32_t s, d;
738
739     __m128i xmm_dst_lo, xmm_dst_hi;
740     __m128i xmm_src_lo, xmm_src_hi;
741     __m128i xmm_alpha_lo, xmm_alpha_hi;
742
743     /* call prefetch hint to optimize cache load*/
744     cache_prefetch ((__m128i*)ps);
745     cache_prefetch ((__m128i*)pd);
746     maybe_prefetch ((__m128i*)pm);
747
748     /* Align dst on a 16-byte boundary */
749     while (w &&
750            ((unsigned long)pd & 15))
751     {
752         d = *pd;
753         s = combine1 (ps, pm);
754
755         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
756         w--;
757         ps++;
758         if (pm)
759             pm++;
760     }
761
762     /* call prefetch hint to optimize cache load*/
763     cache_prefetch ((__m128i*)ps);
764     cache_prefetch ((__m128i*)pd);
765     maybe_prefetch ((__m128i*)pm);
766
767     while (w >= 4)
768     {
769         /* fill cache line with next memory */
770         cache_prefetch_next ((__m128i*)ps);
771         cache_prefetch_next ((__m128i*)pd);
772         maybe_prefetch_next ((__m128i*)pm);
773
774         /* I'm loading unaligned because I'm not sure
775          * about the address alignment.
776          */
777         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
778         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
779
780         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
781         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
782
783         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
784                             &xmm_alpha_lo, &xmm_alpha_hi);
785
786         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
787                     &xmm_alpha_lo, &xmm_alpha_hi,
788                     &xmm_src_lo, &xmm_src_hi);
789
790         /* rebuild the 4 pixel data and save */
791         save_128_aligned ((__m128i*)pd,
792                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
793
794         w -= 4;
795         ps += 4;
796         pd += 4;
797
798         if (pm)
799             pm += 4;
800     }
801
802     while (w)
803     {
804         d = *pd;
805         s = combine1 (ps, pm);
806
807         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
808         ps++;
809         w--;
810         if (pm)
811             pm++;
812     }
813 }
814
815 static force_inline uint32_t
816 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
817 {
818     uint32_t maska = src >> 24;
819
820     if (maska == 0)
821     {
822         return 0;
823     }
824     else if (maska != 0xff)
825     {
826         return pack_1x64_32 (
827             pix_multiply_1x64 (unpack_32_1x64 (dst),
828                                expand_alpha_1x64 (unpack_32_1x64 (src))));
829     }
830
831     return dst;
832 }
833
834 static force_inline void
835 core_combine_in_u_sse2 (uint32_t*       pd,
836                         const uint32_t* ps,
837                         const uint32_t* pm,
838                         int             w)
839 {
840     uint32_t s, d;
841
842     __m128i xmm_src_lo, xmm_src_hi;
843     __m128i xmm_dst_lo, xmm_dst_hi;
844
845     /* call prefetch hint to optimize cache load*/
846     cache_prefetch ((__m128i*)ps);
847     cache_prefetch ((__m128i*)pd);
848     maybe_prefetch ((__m128i*)pm);
849
850     while (w && ((unsigned long) pd & 15))
851     {
852         s = combine1 (ps, pm);
853         d = *pd;
854
855         *pd++ = core_combine_in_u_pixelsse2 (d, s);
856         w--;
857         ps++;
858         if (pm)
859             pm++;
860     }
861
862     /* call prefetch hint to optimize cache load*/
863     cache_prefetch ((__m128i*)ps);
864     cache_prefetch ((__m128i*)pd);
865     maybe_prefetch ((__m128i*)pm);
866
867     while (w >= 4)
868     {
869         /* fill cache line with next memory */
870         cache_prefetch_next ((__m128i*)ps);
871         cache_prefetch_next ((__m128i*)pd);
872         maybe_prefetch_next ((__m128i*)pm);
873
874         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
875         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
876
877         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
878         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
879
880         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
881         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
882                             &xmm_dst_lo, &xmm_dst_hi,
883                             &xmm_dst_lo, &xmm_dst_hi);
884
885         save_128_aligned ((__m128i*)pd,
886                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
887
888         ps += 4;
889         pd += 4;
890         w -= 4;
891         if (pm)
892             pm += 4;
893     }
894
895     while (w)
896     {
897         s = combine1 (ps, pm);
898         d = *pd;
899
900         *pd++ = core_combine_in_u_pixelsse2 (d, s);
901         w--;
902         ps++;
903         if (pm)
904             pm++;
905     }
906 }
907
908 static force_inline void
909 core_combine_reverse_in_u_sse2 (uint32_t*       pd,
910                                 const uint32_t* ps,
911                                 const uint32_t *pm,
912                                 int             w)
913 {
914     uint32_t s, d;
915
916     __m128i xmm_src_lo, xmm_src_hi;
917     __m128i xmm_dst_lo, xmm_dst_hi;
918
919     /* call prefetch hint to optimize cache load*/
920     cache_prefetch ((__m128i*)ps);
921     cache_prefetch ((__m128i*)pd);
922     maybe_prefetch ((__m128i*)pm);
923
924     while (w && ((unsigned long) pd & 15))
925     {
926         s = combine1 (ps, pm);
927         d = *pd;
928
929         *pd++ = core_combine_in_u_pixelsse2 (s, d);
930         ps++;
931         w--;
932         if (pm)
933             pm++;
934     }
935
936     /* call prefetch hint to optimize cache load*/
937     cache_prefetch ((__m128i*)ps);
938     cache_prefetch ((__m128i*)pd);
939     maybe_prefetch ((__m128i*)pm);
940
941     while (w >= 4)
942     {
943         /* fill cache line with next memory */
944         cache_prefetch_next ((__m128i*)ps);
945         cache_prefetch_next ((__m128i*)pd);
946         maybe_prefetch_next ((__m128i*)pm);
947
948         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
949         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
950
951         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
952         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
953
954         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
955         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
956                             &xmm_src_lo, &xmm_src_hi,
957                             &xmm_dst_lo, &xmm_dst_hi);
958
959         save_128_aligned (
960             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
961
962         ps += 4;
963         pd += 4;
964         w -= 4;
965         if (pm)
966             pm += 4;
967     }
968
969     while (w)
970     {
971         s = combine1 (ps, pm);
972         d = *pd;
973
974         *pd++ = core_combine_in_u_pixelsse2 (s, d);
975         w--;
976         ps++;
977         if (pm)
978             pm++;
979     }
980 }
981
982 static force_inline void
983 core_combine_reverse_out_u_sse2 (uint32_t*       pd,
984                                  const uint32_t* ps,
985                                  const uint32_t* pm,
986                                  int             w)
987 {
988     /* call prefetch hint to optimize cache load*/
989     cache_prefetch ((__m128i*)ps);
990     cache_prefetch ((__m128i*)pd);
991     maybe_prefetch ((__m128i*)pm);
992
993     while (w && ((unsigned long) pd & 15))
994     {
995         uint32_t s = combine1 (ps, pm);
996         uint32_t d = *pd;
997
998         *pd++ = pack_1x64_32 (
999             pix_multiply_1x64 (
1000                 unpack_32_1x64 (d), negate_1x64 (
1001                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1002
1003         if (pm)
1004             pm++;
1005         ps++;
1006         w--;
1007     }
1008
1009     /* call prefetch hint to optimize cache load*/
1010     cache_prefetch ((__m128i*)ps);
1011     cache_prefetch ((__m128i*)pd);
1012     maybe_prefetch ((__m128i*)pm);
1013
1014     while (w >= 4)
1015     {
1016         __m128i xmm_src_lo, xmm_src_hi;
1017         __m128i xmm_dst_lo, xmm_dst_hi;
1018
1019         /* fill cache line with next memory */
1020         cache_prefetch_next ((__m128i*)ps);
1021         cache_prefetch_next ((__m128i*)pd);
1022         maybe_prefetch_next ((__m128i*)pm);
1023
1024         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1025         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1026
1027         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1028         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1029
1030         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1031         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1032
1033         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1034                             &xmm_src_lo, &xmm_src_hi,
1035                             &xmm_dst_lo, &xmm_dst_hi);
1036
1037         save_128_aligned (
1038             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1039
1040         ps += 4;
1041         pd += 4;
1042         if (pm)
1043             pm += 4;
1044
1045         w -= 4;
1046     }
1047
1048     while (w)
1049     {
1050         uint32_t s = combine1 (ps, pm);
1051         uint32_t d = *pd;
1052
1053         *pd++ = pack_1x64_32 (
1054             pix_multiply_1x64 (
1055                 unpack_32_1x64 (d), negate_1x64 (
1056                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1057         ps++;
1058         if (pm)
1059             pm++;
1060         w--;
1061     }
1062 }
1063
1064 static force_inline void
1065 core_combine_out_u_sse2 (uint32_t*       pd,
1066                          const uint32_t* ps,
1067                          const uint32_t* pm,
1068                          int             w)
1069 {
1070     /* call prefetch hint to optimize cache load*/
1071     cache_prefetch ((__m128i*)ps);
1072     cache_prefetch ((__m128i*)pd);
1073     maybe_prefetch ((__m128i*)pm);
1074
1075     while (w && ((unsigned long) pd & 15))
1076     {
1077         uint32_t s = combine1 (ps, pm);
1078         uint32_t d = *pd;
1079
1080         *pd++ = pack_1x64_32 (
1081             pix_multiply_1x64 (
1082                 unpack_32_1x64 (s), negate_1x64 (
1083                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1084         w--;
1085         ps++;
1086         if (pm)
1087             pm++;
1088     }
1089
1090     /* call prefetch hint to optimize cache load*/
1091     cache_prefetch ((__m128i*)ps);
1092     cache_prefetch ((__m128i*)pd);
1093     maybe_prefetch ((__m128i*)pm);
1094
1095     while (w >= 4)
1096     {
1097         __m128i xmm_src_lo, xmm_src_hi;
1098         __m128i xmm_dst_lo, xmm_dst_hi;
1099
1100         /* fill cache line with next memory */
1101         cache_prefetch_next ((__m128i*)ps);
1102         cache_prefetch_next ((__m128i*)pd);
1103         maybe_prefetch_next ((__m128i*)pm);
1104
1105         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1106         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1107
1108         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1109         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1110
1111         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1112         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1113
1114         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1115                             &xmm_dst_lo, &xmm_dst_hi,
1116                             &xmm_dst_lo, &xmm_dst_hi);
1117
1118         save_128_aligned (
1119             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1120
1121         ps += 4;
1122         pd += 4;
1123         w -= 4;
1124         if (pm)
1125             pm += 4;
1126     }
1127
1128     while (w)
1129     {
1130         uint32_t s = combine1 (ps, pm);
1131         uint32_t d = *pd;
1132
1133         *pd++ = pack_1x64_32 (
1134             pix_multiply_1x64 (
1135                 unpack_32_1x64 (s), negate_1x64 (
1136                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1137         w--;
1138         ps++;
1139         if (pm)
1140             pm++;
1141     }
1142 }
1143
1144 static force_inline uint32_t
1145 core_combine_atop_u_pixel_sse2 (uint32_t src,
1146                                 uint32_t dst)
1147 {
1148     __m64 s = unpack_32_1x64 (src);
1149     __m64 d = unpack_32_1x64 (dst);
1150
1151     __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1152     __m64 da = expand_alpha_1x64 (d);
1153
1154     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1155 }
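/*
 * The helper above is the ATOP operator:
 * dest = src * dest.alpha + dest * (1 - src.alpha),
 * computed as a single pix_add_multiply with the source alpha negated.
 */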
1156
1157 static force_inline void
1158 core_combine_atop_u_sse2 (uint32_t*       pd,
1159                           const uint32_t* ps,
1160                           const uint32_t* pm,
1161                           int             w)
1162 {
1163     uint32_t s, d;
1164
1165     __m128i xmm_src_lo, xmm_src_hi;
1166     __m128i xmm_dst_lo, xmm_dst_hi;
1167     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1168     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1169
1170     /* call prefetch hint to optimize cache load*/
1171     cache_prefetch ((__m128i*)ps);
1172     cache_prefetch ((__m128i*)pd);
1173     maybe_prefetch ((__m128i*)pm);
1174
1175     while (w && ((unsigned long) pd & 15))
1176     {
1177         s = combine1 (ps, pm);
1178         d = *pd;
1179
1180         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1181         w--;
1182         ps++;
1183         if (pm)
1184             pm++;
1185     }
1186
1187     /* call prefetch hint to optimize cache load*/
1188     cache_prefetch ((__m128i*)ps);
1189     cache_prefetch ((__m128i*)pd);
1190     maybe_prefetch ((__m128i*)pm);
1191
1192     while (w >= 4)
1193     {
1194         /* fill cache line with next memory */
1195         cache_prefetch_next ((__m128i*)ps);
1196         cache_prefetch_next ((__m128i*)pd);
1197         maybe_prefetch_next ((__m128i*)pm);
1198
1199         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1200         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1201
1202         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1203         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1204
1205         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1206                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1207         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1208                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1209
1210         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1211                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1212
1213         pix_add_multiply_2x128 (
1214             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1215             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1216             &xmm_dst_lo, &xmm_dst_hi);
1217
1218         save_128_aligned (
1219             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1220
1221         ps += 4;
1222         pd += 4;
1223         w -= 4;
1224         if (pm)
1225             pm += 4;
1226     }
1227
1228     while (w)
1229     {
1230         s = combine1 (ps, pm);
1231         d = *pd;
1232
1233         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1234         w--;
1235         ps++;
1236         if (pm)
1237             pm++;
1238     }
1239 }
1240
1241 static force_inline uint32_t
1242 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1243                                         uint32_t dst)
1244 {
1245     __m64 s = unpack_32_1x64 (src);
1246     __m64 d = unpack_32_1x64 (dst);
1247
1248     __m64 sa = expand_alpha_1x64 (s);
1249     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1250
1251     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1252 }
1253
1254 static force_inline void
1255 core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
1256                                   const uint32_t* ps,
1257                                   const uint32_t* pm,
1258                                   int             w)
1259 {
1260     uint32_t s, d;
1261
1262     __m128i xmm_src_lo, xmm_src_hi;
1263     __m128i xmm_dst_lo, xmm_dst_hi;
1264     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1265     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1266
1267     /* call prefetch hint to optimize cache load*/
1268     cache_prefetch ((__m128i*)ps);
1269     cache_prefetch ((__m128i*)pd);
1270     maybe_prefetch ((__m128i*)pm);
1271
1272     while (w && ((unsigned long) pd & 15))
1273     {
1274         s = combine1 (ps, pm);
1275         d = *pd;
1276
1277         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1278         ps++;
1279         w--;
1280         if (pm)
1281             pm++;
1282     }
1283
1284     /* call prefetch hint to optimize cache load*/
1285     cache_prefetch ((__m128i*)ps);
1286     cache_prefetch ((__m128i*)pd);
1287     maybe_prefetch ((__m128i*)pm);
1288
1289     while (w >= 4)
1290     {
1291         /* fill cache line with next memory */
1292         cache_prefetch_next ((__m128i*)ps);
1293         cache_prefetch_next ((__m128i*)pd);
1294         maybe_prefetch_next ((__m128i*)pm);
1295
1296         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1297         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1298
1299         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1300         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1301
1302         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1303                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1304         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1305                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1306
1307         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1308                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1309
1310         pix_add_multiply_2x128 (
1311             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1312             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1313             &xmm_dst_lo, &xmm_dst_hi);
1314
1315         save_128_aligned (
1316             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1317
1318         ps += 4;
1319         pd += 4;
1320         w -= 4;
1321         if (pm)
1322             pm += 4;
1323     }
1324
1325     while (w)
1326     {
1327         s = combine1 (ps, pm);
1328         d = *pd;
1329
1330         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1331         ps++;
1332         w--;
1333         if (pm)
1334             pm++;
1335     }
1336 }
1337
1338 static force_inline uint32_t
1339 core_combine_xor_u_pixel_sse2 (uint32_t src,
1340                                uint32_t dst)
1341 {
1342     __m64 s = unpack_32_1x64 (src);
1343     __m64 d = unpack_32_1x64 (dst);
1344
1345     __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1346     __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1347
1348     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1349 }
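/*
 * The helper above is the XOR operator:
 * dest = src * (1 - dest.alpha) + dest * (1 - src.alpha).
 */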
1350
1351 static force_inline void
1352 core_combine_xor_u_sse2 (uint32_t*       dst,
1353                          const uint32_t* src,
1354                          const uint32_t *mask,
1355                          int             width)
1356 {
1357     int w = width;
1358     uint32_t s, d;
1359     uint32_t* pd = dst;
1360     const uint32_t* ps = src;
1361     const uint32_t* pm = mask;
1362
1363     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1364     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1365     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1366     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1367
1368     /* call prefetch hint to optimize cache load*/
1369     cache_prefetch ((__m128i*)ps);
1370     cache_prefetch ((__m128i*)pd);
1371     maybe_prefetch ((__m128i*)pm);
1372
1373     while (w && ((unsigned long) pd & 15))
1374     {
1375         s = combine1 (ps, pm);
1376         d = *pd;
1377
1378         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1379         w--;
1380         ps++;
1381         if (pm)
1382             pm++;
1383     }
1384
1385     /* call prefetch hint to optimize cache load*/
1386     cache_prefetch ((__m128i*)ps);
1387     cache_prefetch ((__m128i*)pd);
1388     maybe_prefetch ((__m128i*)pm);
1389
1390     while (w >= 4)
1391     {
1392         /* fill cache line with next memory */
1393         cache_prefetch_next ((__m128i*)ps);
1394         cache_prefetch_next ((__m128i*)pd);
1395         maybe_prefetch_next ((__m128i*)pm);
1396
1397         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1398         xmm_dst = load_128_aligned ((__m128i*) pd);
1399
1400         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1401         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1402
1403         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1404                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1405         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1406                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1407
1408         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1409                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1410         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1411                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1412
1413         pix_add_multiply_2x128 (
1414             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1415             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1416             &xmm_dst_lo, &xmm_dst_hi);
1417
1418         save_128_aligned (
1419             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1420
1421         ps += 4;
1422         pd += 4;
1423         w -= 4;
1424         if (pm)
1425             pm += 4;
1426     }
1427
1428     while (w)
1429     {
1430         s = combine1 (ps, pm);
1431         d = *pd;
1432
1433         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1434         w--;
1435         ps++;
1436         if (pm)
1437             pm++;
1438     }
1439 }
1440
1441 static force_inline void
1442 core_combine_add_u_sse2 (uint32_t*       dst,
1443                          const uint32_t* src,
1444                          const uint32_t* mask,
1445                          int             width)
1446 {
1447     int w = width;
1448     uint32_t s, d;
1449     uint32_t* pd = dst;
1450     const uint32_t* ps = src;
1451     const uint32_t* pm = mask;
1452
1453     /* call prefetch hint to optimize cache load*/
1454     cache_prefetch ((__m128i*)ps);
1455     cache_prefetch ((__m128i*)pd);
1456     maybe_prefetch ((__m128i*)pm);
1457
1458     while (w && (unsigned long)pd & 15)
1459     {
1460         s = combine1 (ps, pm);
1461         d = *pd;
1462
1463         ps++;
1464         if (pm)
1465             pm++;
1466         *pd++ = _mm_cvtsi64_si32 (
1467             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1468         w--;
1469     }
1470
1471     /* call prefetch hint to optimize cache load*/
1472     cache_prefetch ((__m128i*)ps);
1473     cache_prefetch ((__m128i*)pd);
1474     maybe_prefetch ((__m128i*)pm);
1475
1476     while (w >= 4)
1477     {
1478         __m128i s;
1479
1480         /* fill cache line with next memory */
1481         cache_prefetch_next ((__m128i*)ps);
1482         cache_prefetch_next ((__m128i*)pd);
1483         maybe_prefetch_next ((__m128i*)pm);
1484
1485         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1486
1487         save_128_aligned (
1488             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1489
1490         pd += 4;
1491         ps += 4;
1492         if (pm)
1493             pm += 4;
1494         w -= 4;
1495     }
1496
1497     while (w--)
1498     {
1499         s = combine1 (ps, pm);
1500         d = *pd;
1501
1502         ps++;
1503         *pd++ = _mm_cvtsi64_si32 (
1504             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1505         if (pm)
1506             pm++;
1507     }
1508 }
1509
1510 static force_inline uint32_t
1511 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1512                                     uint32_t dst)
1513 {
1514     __m64 ms = unpack_32_1x64 (src);
1515     __m64 md = unpack_32_1x64 (dst);
1516     uint32_t sa = src >> 24;
1517     uint32_t da = ~dst >> 24;
1518
1519     if (sa > da)
1520     {
1521         ms = pix_multiply_1x64 (
1522             ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1523     }
1524
1525     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1526 }
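/*
 * The helper above implements SATURATE: when the source alpha exceeds the
 * space left in the destination (~dst >> 24), the source is first scaled
 * by (255 - dest.alpha) / src.alpha so that the following saturating add
 * stays within range.
 */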
1527
1528 static force_inline void
1529 core_combine_saturate_u_sse2 (uint32_t *      pd,
1530                               const uint32_t *ps,
1531                               const uint32_t *pm,
1532                               int             w)
1533 {
1534     uint32_t s, d;
1535
1536     uint32_t pack_cmp;
1537     __m128i xmm_src, xmm_dst;
1538
1539     /* call prefetch hint to optimize cache load*/
1540     cache_prefetch ((__m128i*)ps);
1541     cache_prefetch ((__m128i*)pd);
1542     maybe_prefetch ((__m128i*)pm);
1543
1544     while (w && (unsigned long)pd & 15)
1545     {
1546         s = combine1 (ps, pm);
1547         d = *pd;
1548
1549         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1550         w--;
1551         ps++;
1552         if (pm)
1553             pm++;
1554     }
1555
1556     /* call prefetch hint to optimize cache load*/
1557     cache_prefetch ((__m128i*)ps);
1558     cache_prefetch ((__m128i*)pd);
1559     maybe_prefetch ((__m128i*)pm);
1560
1561     while (w >= 4)
1562     {
1563         /* fill cache line with next memory */
1564         cache_prefetch_next ((__m128i*)ps);
1565         cache_prefetch_next ((__m128i*)pd);
1566         maybe_prefetch_next ((__m128i*)pm);
1567
1568         xmm_dst = load_128_aligned  ((__m128i*)pd);
1569         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1570
1571         pack_cmp = _mm_movemask_epi8 (
1572             _mm_cmpgt_epi32 (
1573                 _mm_srli_epi32 (xmm_src, 24),
1574                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1575
1576         /* if some src alpha is greater than the respective ~dst alpha */
1577         if (pack_cmp)
1578         {
1579             s = combine1 (ps++, pm);
1580             d = *pd;
1581             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1582             if (pm)
1583                 pm++;
1584
1585             s = combine1 (ps++, pm);
1586             d = *pd;
1587             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1588             if (pm)
1589                 pm++;
1590
1591             s = combine1 (ps++, pm);
1592             d = *pd;
1593             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1594             if (pm)
1595                 pm++;
1596
1597             s = combine1 (ps++, pm);
1598             d = *pd;
1599             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1600             if (pm)
1601                 pm++;
1602         }
1603         else
1604         {
1605             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1606
1607             pd += 4;
1608             ps += 4;
1609             if (pm)
1610                 pm += 4;
1611         }
1612
1613         w -= 4;
1614     }
1615
1616     while (w--)
1617     {
1618         s = combine1 (ps, pm);
1619         d = *pd;
1620
1621         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1622         ps++;
1623         if (pm)
1624             pm++;
1625     }
1626 }
1627
1628 static force_inline void
1629 core_combine_src_ca_sse2 (uint32_t*       pd,
1630                           const uint32_t* ps,
1631                           const uint32_t *pm,
1632                           int             w)
1633 {
1634     uint32_t s, m;
1635
1636     __m128i xmm_src_lo, xmm_src_hi;
1637     __m128i xmm_mask_lo, xmm_mask_hi;
1638     __m128i xmm_dst_lo, xmm_dst_hi;
1639
1640     /* call prefetch hint to optimize cache load*/
1641     cache_prefetch ((__m128i*)ps);
1642     cache_prefetch ((__m128i*)pd);
1643     cache_prefetch ((__m128i*)pm);
1644
1645     while (w && (unsigned long)pd & 15)
1646     {
1647         s = *ps++;
1648         m = *pm++;
1649         *pd++ = pack_1x64_32 (
1650             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1651         w--;
1652     }
1653
1654     /* call prefetch hint to optimize cache load*/
1655     cache_prefetch ((__m128i*)ps);
1656     cache_prefetch ((__m128i*)pd);
1657     cache_prefetch ((__m128i*)pm);
1658
1659     while (w >= 4)
1660     {
1661         /* fill cache line with next memory */
1662         cache_prefetch_next ((__m128i*)ps);
1663         cache_prefetch_next ((__m128i*)pd);
1664         cache_prefetch_next ((__m128i*)pm);
1665
1666         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1667         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1668
1669         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1670         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1671
1672         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1673                             &xmm_mask_lo, &xmm_mask_hi,
1674                             &xmm_dst_lo, &xmm_dst_hi);
1675
1676         save_128_aligned (
1677             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1678
1679         ps += 4;
1680         pd += 4;
1681         pm += 4;
1682         w -= 4;
1683     }
1684
1685     while (w)
1686     {
1687         s = *ps++;
1688         m = *pm++;
1689         *pd++ = pack_1x64_32 (
1690             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1691         w--;
1692     }
1693 }
1694
1695 static force_inline uint32_t
1696 core_combine_over_ca_pixel_sse2 (uint32_t src,
1697                                  uint32_t mask,
1698                                  uint32_t dst)
1699 {
1700     __m64 s = unpack_32_1x64 (src);
1701     __m64 expAlpha = expand_alpha_1x64 (s);
1702     __m64 unpk_mask = unpack_32_1x64 (mask);
1703     __m64 unpk_dst  = unpack_32_1x64 (dst);
1704
1705     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1706 }
1707
1708 static force_inline void
1709 core_combine_over_ca_sse2 (uint32_t*       pd,
1710                            const uint32_t* ps,
1711                            const uint32_t *pm,
1712                            int             w)
1713 {
1714     uint32_t s, m, d;
1715
1716     __m128i xmm_alpha_lo, xmm_alpha_hi;
1717     __m128i xmm_src_lo, xmm_src_hi;
1718     __m128i xmm_dst_lo, xmm_dst_hi;
1719     __m128i xmm_mask_lo, xmm_mask_hi;
1720
1721     /* call prefetch hint to optimize cache load*/
1722     cache_prefetch ((__m128i*)ps);
1723     cache_prefetch ((__m128i*)pd);
1724     cache_prefetch ((__m128i*)pm);
1725
1726     while (w && (unsigned long)pd & 15)
1727     {
1728         s = *ps++;
1729         m = *pm++;
1730         d = *pd;
1731
1732         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1733         w--;
1734     }
1735
1736     /* call prefetch hint to optimize cache load*/
1737     cache_prefetch ((__m128i*)ps);
1738     cache_prefetch ((__m128i*)pd);
1739     cache_prefetch ((__m128i*)pm);
1740
1741     while (w >= 4)
1742     {
1743         /* fill cache line with next memory */
1744         cache_prefetch_next ((__m128i*)ps);
1745         cache_prefetch_next ((__m128i*)pd);
1746         cache_prefetch_next ((__m128i*)pm);
1747
1748         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1749         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1750         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1751
1752         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1753         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1754         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1755
1756         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1757                             &xmm_alpha_lo, &xmm_alpha_hi);
1758
1759         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1760                        &xmm_alpha_lo, &xmm_alpha_hi,
1761                        &xmm_mask_lo, &xmm_mask_hi,
1762                        &xmm_dst_lo, &xmm_dst_hi);
1763
1764         save_128_aligned (
1765             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1766
1767         ps += 4;
1768         pd += 4;
1769         pm += 4;
1770         w -= 4;
1771     }
1772
1773     while (w)
1774     {
1775         s = *ps++;
1776         m = *pm++;
1777         d = *pd;
1778
1779         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1780         w--;
1781     }
1782 }
1783
1784 static force_inline uint32_t
1785 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1786                                          uint32_t mask,
1787                                          uint32_t dst)
1788 {
1789     __m64 d = unpack_32_1x64 (dst);
1790
1791     return pack_1x64_32 (
1792         over_1x64 (d, expand_alpha_1x64 (d),
1793                    pix_multiply_1x64 (unpack_32_1x64 (src),
1794                                       unpack_32_1x64 (mask))));
1795 }
1796
1797 static force_inline void
1798 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1799                                    const uint32_t* ps,
1800                                    const uint32_t *pm,
1801                                    int             w)
1802 {
1803     uint32_t s, m, d;
1804
1805     __m128i xmm_alpha_lo, xmm_alpha_hi;
1806     __m128i xmm_src_lo, xmm_src_hi;
1807     __m128i xmm_dst_lo, xmm_dst_hi;
1808     __m128i xmm_mask_lo, xmm_mask_hi;
1809
1810     /* call prefetch hint to optimize cache load*/
1811     cache_prefetch ((__m128i*)ps);
1812     cache_prefetch ((__m128i*)pd);
1813     cache_prefetch ((__m128i*)pm);
1814
1815     while (w && (unsigned long)pd & 15)
1816     {
1817         s = *ps++;
1818         m = *pm++;
1819         d = *pd;
1820
1821         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1822         w--;
1823     }
1824
1825     /* call prefetch hint to optimize cache load*/
1826     cache_prefetch ((__m128i*)ps);
1827     cache_prefetch ((__m128i*)pd);
1828     cache_prefetch ((__m128i*)pm);
1829
1830     while (w >= 4)
1831     {
1832         /* fill cache line with next memory */
1833         cache_prefetch_next ((__m128i*)ps);
1834         cache_prefetch_next ((__m128i*)pd);
1835         cache_prefetch_next ((__m128i*)pm);
1836
1837         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1838         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1839         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1840
1841         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1842         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1843         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1844
1845         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1846                             &xmm_alpha_lo, &xmm_alpha_hi);
1847         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1848                             &xmm_mask_lo, &xmm_mask_hi,
1849                             &xmm_mask_lo, &xmm_mask_hi);
1850
1851         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1852                     &xmm_alpha_lo, &xmm_alpha_hi,
1853                     &xmm_mask_lo, &xmm_mask_hi);
1854
1855         save_128_aligned (
1856             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1857
1858         ps += 4;
1859         pd += 4;
1860         pm += 4;
1861         w -= 4;
1862     }
1863
1864     while (w)
1865     {
1866         s = *ps++;
1867         m = *pm++;
1868         d = *pd;
1869
1870         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1871         w--;
1872     }
1873 }
1874
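/* Component-alpha IN: per channel,
 *
 *     dest = (src * mask) * dest.alpha
 */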
1875 static force_inline void
1876 core_combine_in_ca_sse2 (uint32_t *      pd,
1877                          const uint32_t *ps,
1878                          const uint32_t *pm,
1879                          int             w)
1880 {
1881     uint32_t s, m, d;
1882
1883     __m128i xmm_alpha_lo, xmm_alpha_hi;
1884     __m128i xmm_src_lo, xmm_src_hi;
1885     __m128i xmm_dst_lo, xmm_dst_hi;
1886     __m128i xmm_mask_lo, xmm_mask_hi;
1887
1888     /* call prefetch hint to optimize cache load*/
1889     cache_prefetch ((__m128i*)ps);
1890     cache_prefetch ((__m128i*)pd);
1891     cache_prefetch ((__m128i*)pm);
1892
1893     while (w && (unsigned long)pd & 15)
1894     {
1895         s = *ps++;
1896         m = *pm++;
1897         d = *pd;
1898
1899         *pd++ = pack_1x64_32 (
1900             pix_multiply_1x64 (
1901                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1902                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1903
1904         w--;
1905     }
1906
1907     /* call prefetch hint to optimize cache load*/
1908     cache_prefetch ((__m128i*)ps);
1909     cache_prefetch ((__m128i*)pd);
1910     cache_prefetch ((__m128i*)pm);
1911
1912     while (w >= 4)
1913     {
1914         /* fill cache line with next memory */
1915         cache_prefetch_next ((__m128i*)ps);
1916         cache_prefetch_next ((__m128i*)pd);
1917         cache_prefetch_next ((__m128i*)pm);
1918
1919         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1920         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1921         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1922
1923         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1924         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1925         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1926
1927         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1928                             &xmm_alpha_lo, &xmm_alpha_hi);
1929
1930         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1931                             &xmm_mask_lo, &xmm_mask_hi,
1932                             &xmm_dst_lo, &xmm_dst_hi);
1933
1934         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1935                             &xmm_alpha_lo, &xmm_alpha_hi,
1936                             &xmm_dst_lo, &xmm_dst_hi);
1937
1938         save_128_aligned (
1939             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1940
1941         ps += 4;
1942         pd += 4;
1943         pm += 4;
1944         w -= 4;
1945     }
1946
1947     while (w)
1948     {
1949         s = *ps++;
1950         m = *pm++;
1951         d = *pd;
1952
1953         *pd++ = pack_1x64_32 (
1954             pix_multiply_1x64 (
1955                 pix_multiply_1x64 (
1956                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1957                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1958
1959         w--;
1960     }
1961 }
1962
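/* Component-alpha IN_REVERSE: per channel,
 *
 *     dest = dest * (mask * src.alpha)
 */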
1963 static force_inline void
1964 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1965                                  const uint32_t *ps,
1966                                  const uint32_t *pm,
1967                                  int             w)
1968 {
1969     uint32_t s, m, d;
1970
1971     __m128i xmm_alpha_lo, xmm_alpha_hi;
1972     __m128i xmm_src_lo, xmm_src_hi;
1973     __m128i xmm_dst_lo, xmm_dst_hi;
1974     __m128i xmm_mask_lo, xmm_mask_hi;
1975
1976     /* call prefetch hint to optimize cache load*/
1977     cache_prefetch ((__m128i*)ps);
1978     cache_prefetch ((__m128i*)pd);
1979     cache_prefetch ((__m128i*)pm);
1980
1981     while (w && (unsigned long)pd & 15)
1982     {
1983         s = *ps++;
1984         m = *pm++;
1985         d = *pd;
1986
1987         *pd++ = pack_1x64_32 (
1988             pix_multiply_1x64 (
1989                 unpack_32_1x64 (d),
1990                 pix_multiply_1x64 (unpack_32_1x64 (m),
1991                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1992         w--;
1993     }
1994
1995     /* call prefetch hint to optimize cache load*/
1996     cache_prefetch ((__m128i*)ps);
1997     cache_prefetch ((__m128i*)pd);
1998     cache_prefetch ((__m128i*)pm);
1999
2000     while (w >= 4)
2001     {
2002         /* fill cache line with next memory */
2003         cache_prefetch_next ((__m128i*)ps);
2004         cache_prefetch_next ((__m128i*)pd);
2005         cache_prefetch_next ((__m128i*)pm);
2006
2007         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2008         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2009         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2010
2011         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2012         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2013         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2014
2015         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2016                             &xmm_alpha_lo, &xmm_alpha_hi);
2017         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2018                             &xmm_alpha_lo, &xmm_alpha_hi,
2019                             &xmm_alpha_lo, &xmm_alpha_hi);
2020
2021         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2022                             &xmm_alpha_lo, &xmm_alpha_hi,
2023                             &xmm_dst_lo, &xmm_dst_hi);
2024
2025         save_128_aligned (
2026             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2027
2028         ps += 4;
2029         pd += 4;
2030         pm += 4;
2031         w -= 4;
2032     }
2033
2034     while (w)
2035     {
2036         s = *ps++;
2037         m = *pm++;
2038         d = *pd;
2039
2040         *pd++ = pack_1x64_32 (
2041             pix_multiply_1x64 (
2042                 unpack_32_1x64 (d),
2043                 pix_multiply_1x64 (unpack_32_1x64 (m),
2044                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
2045         w--;
2046     }
2047 }
2048
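/* Component-alpha OUT: per channel,
 *
 *     dest = (src * mask) * (1 - dest.alpha)
 */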
2049 static force_inline void
2050 core_combine_out_ca_sse2 (uint32_t *      pd,
2051                           const uint32_t *ps,
2052                           const uint32_t *pm,
2053                           int             w)
2054 {
2055     uint32_t s, m, d;
2056
2057     __m128i xmm_alpha_lo, xmm_alpha_hi;
2058     __m128i xmm_src_lo, xmm_src_hi;
2059     __m128i xmm_dst_lo, xmm_dst_hi;
2060     __m128i xmm_mask_lo, xmm_mask_hi;
2061
2062     /* call prefetch hint to optimize cache load*/
2063     cache_prefetch ((__m128i*)ps);
2064     cache_prefetch ((__m128i*)pd);
2065     cache_prefetch ((__m128i*)pm);
2066
2067     while (w && (unsigned long)pd & 15)
2068     {
2069         s = *ps++;
2070         m = *pm++;
2071         d = *pd;
2072
2073         *pd++ = pack_1x64_32 (
2074             pix_multiply_1x64 (
2075                 pix_multiply_1x64 (
2076                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2077                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2078         w--;
2079     }
2080
2081     /* call prefetch hint to optimize cache load*/
2082     cache_prefetch ((__m128i*)ps);
2083     cache_prefetch ((__m128i*)pd);
2084     cache_prefetch ((__m128i*)pm);
2085
2086     while (w >= 4)
2087     {
2088         /* fill cache line with next memory */
2089         cache_prefetch_next ((__m128i*)ps);
2090         cache_prefetch_next ((__m128i*)pd);
2091         cache_prefetch_next ((__m128i*)pm);
2092
2093         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2094         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2095         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2096
2097         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2098         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2099         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2100
2101         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2102                             &xmm_alpha_lo, &xmm_alpha_hi);
2103         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2104                       &xmm_alpha_lo, &xmm_alpha_hi);
2105
2106         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2107                             &xmm_mask_lo, &xmm_mask_hi,
2108                             &xmm_dst_lo, &xmm_dst_hi);
2109         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2110                             &xmm_alpha_lo, &xmm_alpha_hi,
2111                             &xmm_dst_lo, &xmm_dst_hi);
2112
2113         save_128_aligned (
2114             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2115
2116         ps += 4;
2117         pd += 4;
2118         pm += 4;
2119         w -= 4;
2120     }
2121
2122     while (w)
2123     {
2124         s = *ps++;
2125         m = *pm++;
2126         d = *pd;
2127
2128         *pd++ = pack_1x64_32 (
2129             pix_multiply_1x64 (
2130                 pix_multiply_1x64 (
2131                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2132                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2133
2134         w--;
2135     }
2136 }
2137
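/* Component-alpha OUT_REVERSE: per channel,
 *
 *     dest = dest * (1 - mask * src.alpha)
 */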
2138 static force_inline void
2139 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
2140                                   const uint32_t *ps,
2141                                   const uint32_t *pm,
2142                                   int             w)
2143 {
2144     uint32_t s, m, d;
2145
2146     __m128i xmm_alpha_lo, xmm_alpha_hi;
2147     __m128i xmm_src_lo, xmm_src_hi;
2148     __m128i xmm_dst_lo, xmm_dst_hi;
2149     __m128i xmm_mask_lo, xmm_mask_hi;
2150
2151     /* call prefetch hint to optimize cache load*/
2152     cache_prefetch ((__m128i*)ps);
2153     cache_prefetch ((__m128i*)pd);
2154     cache_prefetch ((__m128i*)pm);
2155
2156     while (w && (unsigned long)pd & 15)
2157     {
2158         s = *ps++;
2159         m = *pm++;
2160         d = *pd;
2161
2162         *pd++ = pack_1x64_32 (
2163             pix_multiply_1x64 (
2164                 unpack_32_1x64 (d),
2165                 negate_1x64 (pix_multiply_1x64 (
2166                                  unpack_32_1x64 (m),
2167                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2168         w--;
2169     }
2170
2171     /* call prefetch hint to optimize cache load*/
2172     cache_prefetch ((__m128i*)ps);
2173     cache_prefetch ((__m128i*)pd);
2174     cache_prefetch ((__m128i*)pm);
2175
2176     while (w >= 4)
2177     {
2178         /* fill cache line with next memory */
2179         cache_prefetch_next ((__m128i*)ps);
2180         cache_prefetch_next ((__m128i*)pd);
2181         cache_prefetch_next ((__m128i*)pm);
2182
2183         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2184         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2185         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2186
2187         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2188         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2189         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2190
2191         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2192                             &xmm_alpha_lo, &xmm_alpha_hi);
2193
2194         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2195                             &xmm_alpha_lo, &xmm_alpha_hi,
2196                             &xmm_mask_lo, &xmm_mask_hi);
2197
2198         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2199                       &xmm_mask_lo, &xmm_mask_hi);
2200
2201         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2202                             &xmm_mask_lo, &xmm_mask_hi,
2203                             &xmm_dst_lo, &xmm_dst_hi);
2204
2205         save_128_aligned (
2206             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2207
2208         ps += 4;
2209         pd += 4;
2210         pm += 4;
2211         w -= 4;
2212     }
2213
2214     while (w)
2215     {
2216         s = *ps++;
2217         m = *pm++;
2218         d = *pd;
2219
2220         *pd++ = pack_1x64_32 (
2221             pix_multiply_1x64 (
2222                 unpack_32_1x64 (d),
2223                 negate_1x64 (pix_multiply_1x64 (
2224                                  unpack_32_1x64 (m),
2225                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2226         w--;
2227     }
2228 }
2229
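/* Component-alpha ATOP: per channel,
 *
 *     dest = (src * mask) * dest.alpha + dest * (1 - mask * src.alpha)
 */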
2230 static force_inline uint32_t
2231 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2232                                  uint32_t mask,
2233                                  uint32_t dst)
2234 {
2235     __m64 m = unpack_32_1x64 (mask);
2236     __m64 s = unpack_32_1x64 (src);
2237     __m64 d = unpack_32_1x64 (dst);
2238     __m64 sa = expand_alpha_1x64 (s);
2239     __m64 da = expand_alpha_1x64 (d);
2240
2241     s = pix_multiply_1x64 (s, m);
2242     m = negate_1x64 (pix_multiply_1x64 (m, sa));
2243
2244     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2245 }
2246
2247 static force_inline void
2248 core_combine_atop_ca_sse2 (uint32_t *      pd,
2249                            const uint32_t *ps,
2250                            const uint32_t *pm,
2251                            int             w)
2252 {
2253     uint32_t s, m, d;
2254
2255     __m128i xmm_src_lo, xmm_src_hi;
2256     __m128i xmm_dst_lo, xmm_dst_hi;
2257     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2258     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2259     __m128i xmm_mask_lo, xmm_mask_hi;
2260
2261     /* call prefetch hint to optimize cache load*/
2262     cache_prefetch ((__m128i*)ps);
2263     cache_prefetch ((__m128i*)pd);
2264     cache_prefetch ((__m128i*)pm);
2265
2266     while (w && (unsigned long)pd & 15)
2267     {
2268         s = *ps++;
2269         m = *pm++;
2270         d = *pd;
2271
2272         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2273         w--;
2274     }
2275
2276     /* call prefetch hint to optimize cache load*/
2277     cache_prefetch ((__m128i*)ps);
2278     cache_prefetch ((__m128i*)pd);
2279     cache_prefetch ((__m128i*)pm);
2280
2281     while (w >= 4)
2282     {
2283         /* fill cache line with next memory */
2284         cache_prefetch_next ((__m128i*)ps);
2285         cache_prefetch_next ((__m128i*)pd);
2286         cache_prefetch_next ((__m128i*)pm);
2287
2288         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2289         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2290         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2291
2292         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2293         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2294         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2295
2296         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2297                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2298         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2299                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2300
2301         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2302                             &xmm_mask_lo, &xmm_mask_hi,
2303                             &xmm_src_lo, &xmm_src_hi);
2304         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2305                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2306                             &xmm_mask_lo, &xmm_mask_hi);
2307
2308         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2309
2310         pix_add_multiply_2x128 (
2311             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2312             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2313             &xmm_dst_lo, &xmm_dst_hi);
2314
2315         save_128_aligned (
2316             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2317
2318         ps += 4;
2319         pd += 4;
2320         pm += 4;
2321         w -= 4;
2322     }
2323
2324     while (w)
2325     {
2326         s = *ps++;
2327         m = *pm++;
2328         d = *pd;
2329
2330         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2331         w--;
2332     }
2333 }
2334
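/* Component-alpha ATOP_REVERSE: per channel,
 *
 *     dest = (src * mask) * (1 - dest.alpha) + dest * (mask * src.alpha)
 */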
2335 static force_inline uint32_t
2336 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2337                                          uint32_t mask,
2338                                          uint32_t dst)
2339 {
2340     __m64 m = unpack_32_1x64 (mask);
2341     __m64 s = unpack_32_1x64 (src);
2342     __m64 d = unpack_32_1x64 (dst);
2343
2344     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2345     __m64 sa = expand_alpha_1x64 (s);
2346
2347     s = pix_multiply_1x64 (s, m);
2348     m = pix_multiply_1x64 (m, sa);
2349
2350     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2351 }
2352
2353 static force_inline void
2354 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2355                                    const uint32_t *ps,
2356                                    const uint32_t *pm,
2357                                    int             w)
2358 {
2359     uint32_t s, m, d;
2360
2361     __m128i xmm_src_lo, xmm_src_hi;
2362     __m128i xmm_dst_lo, xmm_dst_hi;
2363     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2364     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2365     __m128i xmm_mask_lo, xmm_mask_hi;
2366
2367     /* call prefetch hint to optimize cache load*/
2368     cache_prefetch ((__m128i*)ps);
2369     cache_prefetch ((__m128i*)pd);
2370     cache_prefetch ((__m128i*)pm);
2371
2372     while (w && (unsigned long)pd & 15)
2373     {
2374         s = *ps++;
2375         m = *pm++;
2376         d = *pd;
2377
2378         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2379         w--;
2380     }
2381
2382     /* call prefetch hint to optimize cache load*/
2383     cache_prefetch ((__m128i*)ps);
2384     cache_prefetch ((__m128i*)pd);
2385     cache_prefetch ((__m128i*)pm);
2386
2387     while (w >= 4)
2388     {
2389         /* fill cache line with next memory */
2390         cache_prefetch_next ((__m128i*)ps);
2391         cache_prefetch_next ((__m128i*)pd);
2392         cache_prefetch_next ((__m128i*)pm);
2393
2394         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2395         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2396         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2397
2398         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2399         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2400         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2401
2402         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2403                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2404         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2405                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2406
2407         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2408                             &xmm_mask_lo, &xmm_mask_hi,
2409                             &xmm_src_lo, &xmm_src_hi);
2410         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2411                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2412                             &xmm_mask_lo, &xmm_mask_hi);
2413
2414         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2415                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2416
2417         pix_add_multiply_2x128 (
2418             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2419             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2420             &xmm_dst_lo, &xmm_dst_hi);
2421
2422         save_128_aligned (
2423             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2424
2425         ps += 4;
2426         pd += 4;
2427         pm += 4;
2428         w -= 4;
2429     }
2430
2431     while (w)
2432     {
2433         s = *ps++;
2434         m = *pm++;
2435         d = *pd;
2436
2437         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2438         w--;
2439     }
2440 }
2441
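/* Component-alpha XOR: per channel,
 *
 *     dest = (src * mask) * (1 - dest.alpha) + dest * (1 - mask * src.alpha)
 */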
2442 static force_inline uint32_t
2443 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2444                                 uint32_t mask,
2445                                 uint32_t dst)
2446 {
2447     __m64 a = unpack_32_1x64 (mask);
2448     __m64 s = unpack_32_1x64 (src);
2449     __m64 d = unpack_32_1x64 (dst);
2450
2451     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2452                                        a, expand_alpha_1x64 (s)));
2453     __m64 dest      = pix_multiply_1x64 (s, a);
2454     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2455
2456     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2457                                                 &alpha_dst,
2458                                                 &dest,
2459                                                 &alpha_src));
2460 }
2461
2462 static force_inline void
2463 core_combine_xor_ca_sse2 (uint32_t *      pd,
2464                           const uint32_t *ps,
2465                           const uint32_t *pm,
2466                           int             w)
2467 {
2468     uint32_t s, m, d;
2469
2470     __m128i xmm_src_lo, xmm_src_hi;
2471     __m128i xmm_dst_lo, xmm_dst_hi;
2472     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2473     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2474     __m128i xmm_mask_lo, xmm_mask_hi;
2475
2476     /* call prefetch hint to optimize cache load*/
2477     cache_prefetch ((__m128i*)ps);
2478     cache_prefetch ((__m128i*)pd);
2479     cache_prefetch ((__m128i*)pm);
2480
2481     while (w && (unsigned long)pd & 15)
2482     {
2483         s = *ps++;
2484         m = *pm++;
2485         d = *pd;
2486
2487         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2488         w--;
2489     }
2490
2491     /* call prefetch hint to optimize cache load*/
2492     cache_prefetch ((__m128i*)ps);
2493     cache_prefetch ((__m128i*)pd);
2494     cache_prefetch ((__m128i*)pm);
2495
2496     while (w >= 4)
2497     {
2498         /* fill cache line with next memory */
2499         cache_prefetch_next ((__m128i*)ps);
2500         cache_prefetch_next ((__m128i*)pd);
2501         cache_prefetch_next ((__m128i*)pm);
2502
2503         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2504         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2505         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2506
2507         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2508         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2509         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2510
2511         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2512                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2513         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2514                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2515
2516         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2517                             &xmm_mask_lo, &xmm_mask_hi,
2518                             &xmm_src_lo, &xmm_src_hi);
2519         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2520                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2521                             &xmm_mask_lo, &xmm_mask_hi);
2522
2523         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2524                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2525         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2526                       &xmm_mask_lo, &xmm_mask_hi);
2527
2528         pix_add_multiply_2x128 (
2529             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2530             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2531             &xmm_dst_lo, &xmm_dst_hi);
2532
2533         save_128_aligned (
2534             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2535
2536         ps += 4;
2537         pd += 4;
2538         pm += 4;
2539         w -= 4;
2540     }
2541
2542     while (w)
2543     {
2544         s = *ps++;
2545         m = *pm++;
2546         d = *pd;
2547
2548         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2549         w--;
2550     }
2551 }
2552
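/* Component-alpha ADD: per channel,
 *
 *     dest = clamp (src * mask + dest)
 *
 * Rough scalar equivalent (illustrative pseudo-code only; the helper
 * names below are not real pixman functions):
 *
 *     for (i = 0; i < w; i++)
 *         pd[i] = saturating_add_8888 (channel_mul_8888 (ps[i], pm[i]), pd[i]);
 *
 * The SSE2 loop below performs the same computation four pixels at a time.
 */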
2553 static force_inline void
2554 core_combine_add_ca_sse2 (uint32_t *      pd,
2555                           const uint32_t *ps,
2556                           const uint32_t *pm,
2557                           int             w)
2558 {
2559     uint32_t s, m, d;
2560
2561     __m128i xmm_src_lo, xmm_src_hi;
2562     __m128i xmm_dst_lo, xmm_dst_hi;
2563     __m128i xmm_mask_lo, xmm_mask_hi;
2564
2565     /* call prefetch hint to optimize cache load*/
2566     cache_prefetch ((__m128i*)ps);
2567     cache_prefetch ((__m128i*)pd);
2568     cache_prefetch ((__m128i*)pm);
2569
2570     while (w && (unsigned long)pd & 15)
2571     {
2572         s = *ps++;
2573         m = *pm++;
2574         d = *pd;
2575
2576         *pd++ = pack_1x64_32 (
2577             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2578                                              unpack_32_1x64 (m)),
2579                           unpack_32_1x64 (d)));
2580         w--;
2581     }
2582
2583     /* call prefetch hint to optimize cache load*/
2584     cache_prefetch ((__m128i*)ps);
2585     cache_prefetch ((__m128i*)pd);
2586     cache_prefetch ((__m128i*)pm);
2587
2588     while (w >= 4)
2589     {
2590         /* fill cache line with next memory */
2591         cache_prefetch_next ((__m128i*)ps);
2592         cache_prefetch_next ((__m128i*)pd);
2593         cache_prefetch_next ((__m128i*)pm);
2594
2595         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2596         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2597         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2598
2599         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2600         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2601         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2602
2603         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2604                             &xmm_mask_lo, &xmm_mask_hi,
2605                             &xmm_src_lo, &xmm_src_hi);
2606
2607         save_128_aligned (
2608             (__m128i*)pd, pack_2x128_128 (
2609                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2610                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2611
2612         ps += 4;
2613         pd += 4;
2614         pm += 4;
2615         w -= 4;
2616     }
2617
2618     while (w)
2619     {
2620         s = *ps++;
2621         m = *pm++;
2622         d = *pd;
2623
2624         *pd++ = pack_1x64_32 (
2625             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2626                                              unpack_32_1x64 (m)),
2627                           unpack_32_1x64 (d)));
2628         w--;
2629     }
2630 }
2631
2632 /* ---------------------------------------------------
2633  * fb_compose_setup_SSE2
2634  */
2635 static force_inline __m64
2636 create_mask_16_64 (uint16_t mask)
2637 {
2638     return _mm_set1_pi16 (mask);
2639 }
2640
2641 static force_inline __m128i
2642 create_mask_16_128 (uint16_t mask)
2643 {
2644     return _mm_set1_epi16 (mask);
2645 }
2646
2647 static force_inline __m64
2648 create_mask_2x32_64 (uint32_t mask0,
2649                      uint32_t mask1)
2650 {
2651     return _mm_set_pi32 (mask0, mask1);
2652 }
2653
2654 /* Work around a code generation bug in Sun Studio 12. */
2655 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2656 # define create_mask_2x32_128(mask0, mask1)                             \
2657     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2658 #else
2659 static force_inline __m128i
2660 create_mask_2x32_128 (uint32_t mask0,
2661                       uint32_t mask1)
2662 {
2663     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2664 }
2665 #endif
2666
2667 /* SSE2 code patch for fbcompose.c */
2668
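/* Thin wrappers that expose the core_combine_*_sse2 helpers through the
 * pixman combiner interface.  Each wrapper ends with _mm_empty () so the
 * MMX state left behind by the 1x64 helpers is cleared before control
 * returns to code that may use x87 floating point.
 */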
2669 static void
2670 sse2_combine_over_u (pixman_implementation_t *imp,
2671                      pixman_op_t              op,
2672                      uint32_t *               dst,
2673                      const uint32_t *         src,
2674                      const uint32_t *         mask,
2675                      int                      width)
2676 {
2677     core_combine_over_u_sse2 (dst, src, mask, width);
2678     _mm_empty ();
2679 }
2680
2681 static void
2682 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2683                              pixman_op_t              op,
2684                              uint32_t *               dst,
2685                              const uint32_t *         src,
2686                              const uint32_t *         mask,
2687                              int                      width)
2688 {
2689     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2690     _mm_empty ();
2691 }
2692
2693 static void
2694 sse2_combine_in_u (pixman_implementation_t *imp,
2695                    pixman_op_t              op,
2696                    uint32_t *               dst,
2697                    const uint32_t *         src,
2698                    const uint32_t *         mask,
2699                    int                      width)
2700 {
2701     core_combine_in_u_sse2 (dst, src, mask, width);
2702     _mm_empty ();
2703 }
2704
2705 static void
2706 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2707                            pixman_op_t              op,
2708                            uint32_t *               dst,
2709                            const uint32_t *         src,
2710                            const uint32_t *         mask,
2711                            int                      width)
2712 {
2713     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2714     _mm_empty ();
2715 }
2716
2717 static void
2718 sse2_combine_out_u (pixman_implementation_t *imp,
2719                     pixman_op_t              op,
2720                     uint32_t *               dst,
2721                     const uint32_t *         src,
2722                     const uint32_t *         mask,
2723                     int                      width)
2724 {
2725     core_combine_out_u_sse2 (dst, src, mask, width);
2726     _mm_empty ();
2727 }
2728
2729 static void
2730 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2731                             pixman_op_t              op,
2732                             uint32_t *               dst,
2733                             const uint32_t *         src,
2734                             const uint32_t *         mask,
2735                             int                      width)
2736 {
2737     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2738     _mm_empty ();
2739 }
2740
2741 static void
2742 sse2_combine_atop_u (pixman_implementation_t *imp,
2743                      pixman_op_t              op,
2744                      uint32_t *               dst,
2745                      const uint32_t *         src,
2746                      const uint32_t *         mask,
2747                      int                      width)
2748 {
2749     core_combine_atop_u_sse2 (dst, src, mask, width);
2750     _mm_empty ();
2751 }
2752
2753 static void
2754 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2755                              pixman_op_t              op,
2756                              uint32_t *               dst,
2757                              const uint32_t *         src,
2758                              const uint32_t *         mask,
2759                              int                      width)
2760 {
2761     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2762     _mm_empty ();
2763 }
2764
2765 static void
2766 sse2_combine_xor_u (pixman_implementation_t *imp,
2767                     pixman_op_t              op,
2768                     uint32_t *               dst,
2769                     const uint32_t *         src,
2770                     const uint32_t *         mask,
2771                     int                      width)
2772 {
2773     core_combine_xor_u_sse2 (dst, src, mask, width);
2774     _mm_empty ();
2775 }
2776
2777 static void
2778 sse2_combine_add_u (pixman_implementation_t *imp,
2779                     pixman_op_t              op,
2780                     uint32_t *               dst,
2781                     const uint32_t *         src,
2782                     const uint32_t *         mask,
2783                     int                      width)
2784 {
2785     core_combine_add_u_sse2 (dst, src, mask, width);
2786     _mm_empty ();
2787 }
2788
2789 static void
2790 sse2_combine_saturate_u (pixman_implementation_t *imp,
2791                          pixman_op_t              op,
2792                          uint32_t *               dst,
2793                          const uint32_t *         src,
2794                          const uint32_t *         mask,
2795                          int                      width)
2796 {
2797     core_combine_saturate_u_sse2 (dst, src, mask, width);
2798     _mm_empty ();
2799 }
2800
2801 static void
2802 sse2_combine_src_ca (pixman_implementation_t *imp,
2803                      pixman_op_t              op,
2804                      uint32_t *               dst,
2805                      const uint32_t *         src,
2806                      const uint32_t *         mask,
2807                      int                      width)
2808 {
2809     core_combine_src_ca_sse2 (dst, src, mask, width);
2810     _mm_empty ();
2811 }
2812
2813 static void
2814 sse2_combine_over_ca (pixman_implementation_t *imp,
2815                       pixman_op_t              op,
2816                       uint32_t *               dst,
2817                       const uint32_t *         src,
2818                       const uint32_t *         mask,
2819                       int                      width)
2820 {
2821     core_combine_over_ca_sse2 (dst, src, mask, width);
2822     _mm_empty ();
2823 }
2824
2825 static void
2826 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2827                               pixman_op_t              op,
2828                               uint32_t *               dst,
2829                               const uint32_t *         src,
2830                               const uint32_t *         mask,
2831                               int                      width)
2832 {
2833     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2834     _mm_empty ();
2835 }
2836
2837 static void
2838 sse2_combine_in_ca (pixman_implementation_t *imp,
2839                     pixman_op_t              op,
2840                     uint32_t *               dst,
2841                     const uint32_t *         src,
2842                     const uint32_t *         mask,
2843                     int                      width)
2844 {
2845     core_combine_in_ca_sse2 (dst, src, mask, width);
2846     _mm_empty ();
2847 }
2848
2849 static void
2850 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2851                             pixman_op_t              op,
2852                             uint32_t *               dst,
2853                             const uint32_t *         src,
2854                             const uint32_t *         mask,
2855                             int                      width)
2856 {
2857     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2858     _mm_empty ();
2859 }
2860
2861 static void
2862 sse2_combine_out_ca (pixman_implementation_t *imp,
2863                      pixman_op_t              op,
2864                      uint32_t *               dst,
2865                      const uint32_t *         src,
2866                      const uint32_t *         mask,
2867                      int                      width)
2868 {
2869     core_combine_out_ca_sse2 (dst, src, mask, width);
2870     _mm_empty ();
2871 }
2872
2873 static void
2874 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2875                              pixman_op_t              op,
2876                              uint32_t *               dst,
2877                              const uint32_t *         src,
2878                              const uint32_t *         mask,
2879                              int                      width)
2880 {
2881     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2882     _mm_empty ();
2883 }
2884
2885 static void
2886 sse2_combine_atop_ca (pixman_implementation_t *imp,
2887                       pixman_op_t              op,
2888                       uint32_t *               dst,
2889                       const uint32_t *         src,
2890                       const uint32_t *         mask,
2891                       int                      width)
2892 {
2893     core_combine_atop_ca_sse2 (dst, src, mask, width);
2894     _mm_empty ();
2895 }
2896
2897 static void
2898 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2899                               pixman_op_t              op,
2900                               uint32_t *               dst,
2901                               const uint32_t *         src,
2902                               const uint32_t *         mask,
2903                               int                      width)
2904 {
2905     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2906     _mm_empty ();
2907 }
2908
2909 static void
2910 sse2_combine_xor_ca (pixman_implementation_t *imp,
2911                      pixman_op_t              op,
2912                      uint32_t *               dst,
2913                      const uint32_t *         src,
2914                      const uint32_t *         mask,
2915                      int                      width)
2916 {
2917     core_combine_xor_ca_sse2 (dst, src, mask, width);
2918     _mm_empty ();
2919 }
2920
2921 static void
2922 sse2_combine_add_ca (pixman_implementation_t *imp,
2923                      pixman_op_t              op,
2924                      uint32_t *               dst,
2925                      const uint32_t *         src,
2926                      const uint32_t *         mask,
2927                      int                      width)
2928 {
2929     core_combine_add_ca_sse2 (dst, src, mask, width);
2930     _mm_empty ();
2931 }
2932
2933 /* -------------------------------------------------------------------
2934  * composite_over_n_8888
2935  */
2936
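/* Solid source OVER a 32 bpp destination: the source pixel is expanded
 * once outside the loop, then each row is processed as an unaligned
 * scalar head, a 4-pixel aligned SSE2 body and a scalar tail.
 */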
2937 static void
2938 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2939                             pixman_op_t              op,
2940                             pixman_image_t *         src_image,
2941                             pixman_image_t *         mask_image,
2942                             pixman_image_t *         dst_image,
2943                             int32_t                  src_x,
2944                             int32_t                  src_y,
2945                             int32_t                  mask_x,
2946                             int32_t                  mask_y,
2947                             int32_t                  dest_x,
2948                             int32_t                  dest_y,
2949                             int32_t                  width,
2950                             int32_t                  height)
2951 {
2952     uint32_t src;
2953     uint32_t    *dst_line, *dst, d;
2954     int32_t w;
2955     int dst_stride;
2956     __m128i xmm_src, xmm_alpha;
2957     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2958
2959     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2960
2961     if (src == 0)
2962         return;
2963
2964     PIXMAN_IMAGE_GET_LINE (
2965         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2966
2967     xmm_src = expand_pixel_32_1x128 (src);
2968     xmm_alpha = expand_alpha_1x128 (xmm_src);
2969
2970     while (height--)
2971     {
2972         dst = dst_line;
2973
2974         /* call prefetch hint to optimize cache load*/
2975         cache_prefetch ((__m128i*)dst);
2976
2977         dst_line += dst_stride;
2978         w = width;
2979
2980         while (w && (unsigned long)dst & 15)
2981         {
2982             d = *dst;
2983             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2984                                               _mm_movepi64_pi64 (xmm_alpha),
2985                                               unpack_32_1x64 (d)));
2986             w--;
2987         }
2988
2989         cache_prefetch ((__m128i*)dst);
2990
2991         while (w >= 4)
2992         {
2993             /* fill cache line with next memory */
2994             cache_prefetch_next ((__m128i*)dst);
2995
2996             xmm_dst = load_128_aligned ((__m128i*)dst);
2997
2998             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2999
3000             over_2x128 (&xmm_src, &xmm_src,
3001                         &xmm_alpha, &xmm_alpha,
3002                         &xmm_dst_lo, &xmm_dst_hi);
3003
3005             /* rebuild the 4 pixel data and save */
3005             save_128_aligned (
3006                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3007
3008             w -= 4;
3009             dst += 4;
3010         }
3011
3012         while (w)
3013         {
3014             d = *dst;
3015             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3016                                               _mm_movepi64_pi64 (xmm_alpha),
3017                                               unpack_32_1x64 (d)));
3018             w--;
3019         }
3020
3021     }
3022     _mm_empty ();
3023 }
3024
3025 /* ---------------------------------------------------------------------
3026  * composite_over_n_0565
3027  */
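/* Solid source OVER an r5g6b5 destination: each 16-bit pixel is expanded
 * to 8888, blended, and packed back, eight pixels per 128-bit iteration.
 */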
3028 static void
3029 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3030                             pixman_op_t              op,
3031                             pixman_image_t *         src_image,
3032                             pixman_image_t *         mask_image,
3033                             pixman_image_t *         dst_image,
3034                             int32_t                  src_x,
3035                             int32_t                  src_y,
3036                             int32_t                  mask_x,
3037                             int32_t                  mask_y,
3038                             int32_t                  dest_x,
3039                             int32_t                  dest_y,
3040                             int32_t                  width,
3041                             int32_t                  height)
3042 {
3043     uint32_t src;
3044     uint16_t    *dst_line, *dst, d;
3045     int32_t w;
3046     int dst_stride;
3047     __m128i xmm_src, xmm_alpha;
3048     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3049
3050     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3051
3052     if (src == 0)
3053         return;
3054
3055     PIXMAN_IMAGE_GET_LINE (
3056         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3057
3058     xmm_src = expand_pixel_32_1x128 (src);
3059     xmm_alpha = expand_alpha_1x128 (xmm_src);
3060
3061     while (height--)
3062     {
3063         dst = dst_line;
3064
3065         /* call prefetch hint to optimize cache load*/
3066         cache_prefetch ((__m128i*)dst);
3067
3068         dst_line += dst_stride;
3069         w = width;
3070
3071         while (w && (unsigned long)dst & 15)
3072         {
3073             d = *dst;
3074
3075             *dst++ = pack_565_32_16 (
3076                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3077                                          _mm_movepi64_pi64 (xmm_alpha),
3078                                          expand565_16_1x64 (d))));
3079             w--;
3080         }
3081
3082         /* call prefetch hint to optimize cache load*/
3083         cache_prefetch ((__m128i*)dst);
3084
3085         while (w >= 8)
3086         {
3087             /* fill cache line with next memory */
3088             cache_prefetch_next ((__m128i*)dst);
3089
3090             xmm_dst = load_128_aligned ((__m128i*)dst);
3091
3092             unpack_565_128_4x128 (xmm_dst,
3093                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3094
3095             over_2x128 (&xmm_src, &xmm_src,
3096                         &xmm_alpha, &xmm_alpha,
3097                         &xmm_dst0, &xmm_dst1);
3098             over_2x128 (&xmm_src, &xmm_src,
3099                         &xmm_alpha, &xmm_alpha,
3100                         &xmm_dst2, &xmm_dst3);
3101
3102             xmm_dst = pack_565_4x128_128 (
3103                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3104
3105             save_128_aligned ((__m128i*)dst, xmm_dst);
3106
3107             dst += 8;
3108             w -= 8;
3109         }
3110
3111         while (w--)
3112         {
3113             d = *dst;
3114             *dst++ = pack_565_32_16 (
3115                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3116                                          _mm_movepi64_pi64 (xmm_alpha),
3117                                          expand565_16_1x64 (d))));
3118         }
3119     }
3120
3121     _mm_empty ();
3122 }
3123
3124 /* ------------------------------
3125  * composite_add_n_8888_8888_ca
3126  */
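/* Solid source with a per-component a8r8g8b8 mask, ADD operator:
 *
 *     dest = clamp (src * mask + dest)        (per channel)
 *
 * A 4-pixel block whose mask words are all zero is detected with
 * _mm_cmpeq_epi32/_mm_movemask_epi8 and skipped entirely.
 */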
3127 static void
3128 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3129                                    pixman_op_t              op,
3130                                    pixman_image_t *         src_image,
3131                                    pixman_image_t *         mask_image,
3132                                    pixman_image_t *         dst_image,
3133                                    int32_t                  src_x,
3134                                    int32_t                  src_y,
3135                                    int32_t                  mask_x,
3136                                    int32_t                  mask_y,
3137                                    int32_t                  dest_x,
3138                                    int32_t                  dest_y,
3139                                    int32_t                  width,
3140                                    int32_t                  height)
3141 {
3142     uint32_t src;
3143     uint32_t    *dst_line, d;
3144     uint32_t    *mask_line, m;
3145     uint32_t pack_cmp;
3146     int dst_stride, mask_stride;
3147
3148     __m128i xmm_src, xmm_alpha;
3149     __m128i xmm_dst;
3150     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3151
3152     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3153
3154     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3156
3157     if (src == 0)
3158         return;
3159
3160     PIXMAN_IMAGE_GET_LINE (
3161         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3162     PIXMAN_IMAGE_GET_LINE (
3163         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3164
3165     xmm_src = _mm_unpacklo_epi8 (
3166         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3167     xmm_alpha = expand_alpha_1x128 (xmm_src);
3168     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3169     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3170
3171     while (height--)
3172     {
3173         int w = width;
3174         const uint32_t *pm = (uint32_t *)mask_line;
3175         uint32_t *pd = (uint32_t *)dst_line;
3176
3177         dst_line += dst_stride;
3178         mask_line += mask_stride;
3179
3180         /* call prefetch hint to optimize cache load*/
3181         cache_prefetch ((__m128i*)pd);
3182         cache_prefetch ((__m128i*)pm);
3183
3184         while (w && (unsigned long)pd & 15)
3185         {
3186             m = *pm++;
3187
3188             if (m)
3189             {
3190                 d = *pd;
3191
3192                 mmx_mask = unpack_32_1x64 (m);
3193                 mmx_dest = unpack_32_1x64 (d);
3194
3195                 *pd = pack_1x64_32 (
3196                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3197             }
3198
3199             pd++;
3200             w--;
3201         }
3202
3203         /* call prefetch hint to optimize cache load*/
3204         cache_prefetch ((__m128i*)pd);
3205         cache_prefetch ((__m128i*)pm);
3206
3207         while (w >= 4)
3208         {
3209             /* fill cache line with next memory */
3210             cache_prefetch_next ((__m128i*)pd);
3211             cache_prefetch_next ((__m128i*)pm);
3212
3213             xmm_mask = load_128_unaligned ((__m128i*)pm);
3214
3215             pack_cmp =
3216                 _mm_movemask_epi8 (
3217                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3218
3219             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3220             if (pack_cmp != 0xffff)
3221             {
3222                 xmm_dst = load_128_aligned ((__m128i*)pd);
3223
3224                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3225
3226                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3227                                     &xmm_mask_lo, &xmm_mask_hi,
3228                                     &xmm_mask_lo, &xmm_mask_hi);
3229                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3230
3231                 save_128_aligned (
3232                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3233             }
3234
3235             pd += 4;
3236             pm += 4;
3237             w -= 4;
3238         }
3239
3240         while (w)
3241         {
3242             m = *pm++;
3243
3244             if (m)
3245             {
3246                 d = *pd;
3247
3248                 mmx_mask = unpack_32_1x64 (m);
3249                 mmx_dest = unpack_32_1x64 (d);
3250
3251                 *pd = pack_1x64_32 (
3252                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3253             }
3254
3255             pd++;
3256             w--;
3257         }
3258     }
3259
3260     _mm_empty ();
3261 }
3262
3263 /* ---------------------------------------------------------------------------
3264  * composite_over_n_8888_8888_ca
3265  */
3266
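/* Solid source with a per-component a8r8g8b8 mask, OVER operator:
 *
 *     dest = (src IN mask) OVER dest          (per channel)
 *
 * As in the ADD variant above, 4-pixel blocks with an all-zero mask are
 * skipped.
 */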
3267 static void
3268 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3269                                     pixman_op_t              op,
3270                                     pixman_image_t *         src_image,
3271                                     pixman_image_t *         mask_image,
3272                                     pixman_image_t *         dst_image,
3273                                     int32_t                  src_x,
3274                                     int32_t                  src_y,
3275                                     int32_t                  mask_x,
3276                                     int32_t                  mask_y,
3277                                     int32_t                  dest_x,
3278                                     int32_t                  dest_y,
3279                                     int32_t                  width,
3280                                     int32_t                  height)
3281 {
3282     uint32_t src;
3283     uint32_t    *dst_line, d;
3284     uint32_t    *mask_line, m;
3285     uint32_t pack_cmp;
3286     int dst_stride, mask_stride;
3287
3288     __m128i xmm_src, xmm_alpha;
3289     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3290     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3291
3292     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3293
3294     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3295
3296     if (src == 0)
3297         return;
3298
3299     PIXMAN_IMAGE_GET_LINE (
3300         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3301     PIXMAN_IMAGE_GET_LINE (
3302         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3303
3304     xmm_src = _mm_unpacklo_epi8 (
3305         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3306     xmm_alpha = expand_alpha_1x128 (xmm_src);
3307     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3308     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3309
3310     while (height--)
3311     {
3312         int w = width;
3313         const uint32_t *pm = (uint32_t *)mask_line;
3314         uint32_t *pd = (uint32_t *)dst_line;
3315
3316         dst_line += dst_stride;
3317         mask_line += mask_stride;
3318
3319         /* call prefetch hint to optimize cache load*/
3320         cache_prefetch ((__m128i*)pd);
3321         cache_prefetch ((__m128i*)pm);
3322
3323         while (w && (unsigned long)pd & 15)
3324         {
3325             m = *pm++;
3326
3327             if (m)
3328             {
3329                 d = *pd;
3330                 mmx_mask = unpack_32_1x64 (m);
3331                 mmx_dest = unpack_32_1x64 (d);
3332
3333                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3334                                                   &mmx_alpha,
3335                                                   &mmx_mask,
3336                                                   &mmx_dest));
3337             }
3338
3339             pd++;
3340             w--;
3341         }
3342
3343         /* call prefetch hint to optimize cache load*/
3344         cache_prefetch ((__m128i*)pd);
3345         cache_prefetch ((__m128i*)pm);
3346
3347         while (w >= 4)
3348         {
3349             /* fill cache line with next memory */
3350             cache_prefetch_next ((__m128i*)pd);
3351             cache_prefetch_next ((__m128i*)pm);
3352
3353             xmm_mask = load_128_unaligned ((__m128i*)pm);
3354
3355             pack_cmp =
3356                 _mm_movemask_epi8 (
3357                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3358
3359             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3360             if (pack_cmp != 0xffff)
3361             {
3362                 xmm_dst = load_128_aligned ((__m128i*)pd);
3363
3364                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3365                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3366
3367                 in_over_2x128 (&xmm_src, &xmm_src,
3368                                &xmm_alpha, &xmm_alpha,
3369                                &xmm_mask_lo, &xmm_mask_hi,
3370                                &xmm_dst_lo, &xmm_dst_hi);
3371
3372                 save_128_aligned (
3373                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3374             }
3375
3376             pd += 4;
3377             pm += 4;
3378             w -= 4;
3379         }
3380
3381         while (w)
3382         {
3383             m = *pm++;
3384
3385             if (m)
3386             {
3387                 d = *pd;
3388                 mmx_mask = unpack_32_1x64 (m);
3389                 mmx_dest = unpack_32_1x64 (d);
3390
3391                 *pd = pack_1x64_32 (
3392                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3393             }
3394
3395             pd++;
3396             w--;
3397         }
3398     }
3399
3400     _mm_empty ();
3401 }
3402
3403 /*---------------------------------------------------------------------
3404  * composite_over_8888_n_8888
3405  */
3406
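/* a8r8g8b8 source OVER a 32 bpp destination through a solid mask; only
 * the alpha byte of the mask is used, replicated into every 16-bit lane
 * of xmm_mask.
 */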
3407 static void
3408 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3409                                  pixman_op_t              op,
3410                                  pixman_image_t *         src_image,
3411                                  pixman_image_t *         mask_image,
3412                                  pixman_image_t *         dst_image,
3413                                  int32_t                  src_x,
3414                                  int32_t                  src_y,
3415                                  int32_t                  mask_x,
3416                                  int32_t                  mask_y,
3417                                  int32_t                  dest_x,
3418                                  int32_t                  dest_y,
3419                                  int32_t                  width,
3420                                  int32_t                  height)
3421 {
3422     uint32_t    *dst_line, *dst;
3423     uint32_t    *src_line, *src;
3424     uint32_t mask;
3425     int32_t w;
3426     int dst_stride, src_stride;
3427
3428     __m128i xmm_mask;
3429     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3430     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3431     __m128i xmm_alpha_lo, xmm_alpha_hi;
3432
3433     PIXMAN_IMAGE_GET_LINE (
3434         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3435     PIXMAN_IMAGE_GET_LINE (
3436         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3437
3438     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3439
3440     xmm_mask = create_mask_16_128 (mask >> 24);
3441
3442     while (height--)
3443     {
3444         dst = dst_line;
3445         dst_line += dst_stride;
3446         src = src_line;
3447         src_line += src_stride;
3448         w = width;
3449
3450         /* call prefetch hint to optimize cache load*/
3451         cache_prefetch ((__m128i*)dst);
3452         cache_prefetch ((__m128i*)src);
3453
3454         while (w && (unsigned long)dst & 15)
3455         {
3456             uint32_t s = *src++;
3457             uint32_t d = *dst;
3458
3459             __m64 ms = unpack_32_1x64 (s);
3460             __m64 alpha = expand_alpha_1x64 (ms);
3461             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3462             __m64 dest  = unpack_32_1x64 (d);
3463
3464             *dst++ = pack_1x64_32 (
3465                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3466
3467             w--;
3468         }
3469
3470         /* call prefetch hint to optimize cache load*/
3471         cache_prefetch ((__m128i*)dst);
3472         cache_prefetch ((__m128i*)src);
3473
3474         while (w >= 4)
3475         {
3476             /* fill cache line with next memory */
3477             cache_prefetch_next ((__m128i*)dst);
3478             cache_prefetch_next ((__m128i*)src);
3479
3480             xmm_src = load_128_unaligned ((__m128i*)src);
3481             xmm_dst = load_128_aligned ((__m128i*)dst);
3482
3483             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3484             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3485             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3486                                 &xmm_alpha_lo, &xmm_alpha_hi);
3487
3488             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3489                            &xmm_alpha_lo, &xmm_alpha_hi,
3490                            &xmm_mask, &xmm_mask,
3491                            &xmm_dst_lo, &xmm_dst_hi);
3492
3493             save_128_aligned (
3494                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3495
3496             dst += 4;
3497             src += 4;
3498             w -= 4;
3499         }
3500
3501         while (w)
3502         {
3503             uint32_t s = *src++;
3504             uint32_t d = *dst;
3505
3506             __m64 ms = unpack_32_1x64 (s);
3507             __m64 alpha = expand_alpha_1x64 (ms);
3508             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3509             __m64 dest  = unpack_32_1x64 (d);
3510
3511             *dst++ = pack_1x64_32 (
3512                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3513
3514             w--;
3515         }
3516     }
3517
3518     _mm_empty ();
3519 }
3520
3521 /*---------------------------------------------------------------------
3522  * composite_src_x888_8888
3523  */
3524
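/*
 * SRC copy of x8r8g8b8 into a8r8g8b8: every pixel is copied with its
 * alpha byte forced to 0xff (dest = src | 0xff000000).  The main loop
 * moves 16 pixels per iteration using four 128-bit loads and stores.
 */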
3525 static void
3526 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
3527                               pixman_op_t              op,
3528                               pixman_image_t *         src_image,
3529                               pixman_image_t *         mask_image,
3530                               pixman_image_t *         dst_image,
3531                               int32_t                  src_x,
3532                               int32_t                  src_y,
3533                               int32_t                  mask_x,
3534                               int32_t                  mask_y,
3535                               int32_t                  dest_x,
3536                               int32_t                  dest_y,
3537                               int32_t                  width,
3538                               int32_t                  height)
3539 {
3540     uint32_t    *dst_line, *dst;
3541     uint32_t    *src_line, *src;
3542     int32_t w;
3543     int dst_stride, src_stride;
3544
3545
3546     PIXMAN_IMAGE_GET_LINE (
3547         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3548     PIXMAN_IMAGE_GET_LINE (
3549         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3550
3551     while (height--)
3552     {
3553         dst = dst_line;
3554         dst_line += dst_stride;
3555         src = src_line;
3556         src_line += src_stride;
3557         w = width;
3558
3559         /* call prefetch hint to optimize cache load*/
3560         cache_prefetch ((__m128i*)src);
3561
3562         while (w && (unsigned long)dst & 15)
3563         {
3564             *dst++ = *src++ | 0xff000000;
3565             w--;
3566         }
3567
3568         /* call prefetch hint to optimize cache load*/
3569         cache_prefetch ((__m128i*)src);
3570
3571         while (w >= 16)
3572         {
3573             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
3574             
3575             /* fill cache line with next memory */
3576             cache_prefetch_next ((__m128i*)src);
3577
3578             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
3579             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
3580             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
3581             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
3582             
3583             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
3584             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
3585             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
3586             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
3587             
3588             dst += 16;
3589             src += 16;
3590             w -= 16;
3591         }
3592
3593         while (w)
3594         {
3595             *dst++ = *src++ | 0xff000000;
3596             w--;
3597         }
3598     }
3599
3600     _mm_empty ();
3601 }
3602
3603 /* ---------------------------------------------------------------------
3604  * composite_over_x888_n_8888
3605  */
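/*
 * OVER of an x8r8g8b8 source (treated as fully opaque) onto an a8r8g8b8
 * destination with a solid mask.  Since the source alpha is forced to
 * 0xff, each channel reduces to a plain blend with the mask alpha m:
 * dest = src * m + dest * (1 - m), values taken as [0, 1].
 */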
3606 static void
3607 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3608                                  pixman_op_t              op,
3609                                  pixman_image_t *         src_image,
3610                                  pixman_image_t *         mask_image,
3611                                  pixman_image_t *         dst_image,
3612                                  int32_t                  src_x,
3613                                  int32_t                  src_y,
3614                                  int32_t                  mask_x,
3615                                  int32_t                  mask_y,
3616                                  int32_t                  dest_x,
3617                                  int32_t                  dest_y,
3618                                  int32_t                  width,
3619                                  int32_t                  height)
3620 {
3621     uint32_t    *dst_line, *dst;
3622     uint32_t    *src_line, *src;
3623     uint32_t mask;
3624     int dst_stride, src_stride;
3625     int32_t w;
3626
3627     __m128i xmm_mask, xmm_alpha;
3628     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3629     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3630
3631     PIXMAN_IMAGE_GET_LINE (
3632         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3633     PIXMAN_IMAGE_GET_LINE (
3634         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3635
3636     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3637
3638     xmm_mask = create_mask_16_128 (mask >> 24);
3639     xmm_alpha = mask_00ff;
3640
3641     while (height--)
3642     {
3643         dst = dst_line;
3644         dst_line += dst_stride;
3645         src = src_line;
3646         src_line += src_stride;
3647         w = width;
3648
3649         /* call prefetch hint to optimize cache load*/
3650         cache_prefetch ((__m128i*)dst);
3651         cache_prefetch ((__m128i*)src);
3652
3653         while (w && (unsigned long)dst & 15)
3654         {
3655             uint32_t s = (*src++) | 0xff000000;
3656             uint32_t d = *dst;
3657
3658             __m64 src   = unpack_32_1x64 (s);
3659             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3660             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3661             __m64 dest  = unpack_32_1x64 (d);
3662
3663             *dst++ = pack_1x64_32 (
3664                 in_over_1x64 (&src, &alpha, &mask, &dest));
3665
3666             w--;
3667         }
3668
3669         /* call prefetch hint to optimize cache load*/
3670         cache_prefetch ((__m128i*)dst);
3671         cache_prefetch ((__m128i*)src);
3672
3673         while (w >= 4)
3674         {
3675             /* fill cache line with next memory */
3676             cache_prefetch_next ((__m128i*)dst);
3677             cache_prefetch_next ((__m128i*)src);
3678
3679             xmm_src = _mm_or_si128 (
3680                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3681             xmm_dst = load_128_aligned ((__m128i*)dst);
3682
3683             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3684             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3685
3686             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3687                            &xmm_alpha, &xmm_alpha,
3688                            &xmm_mask, &xmm_mask,
3689                            &xmm_dst_lo, &xmm_dst_hi);
3690
3691             save_128_aligned (
3692                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3693
3694             dst += 4;
3695             src += 4;
3696             w -= 4;
3697
3698         }
3699
3700         while (w)
3701         {
3702             uint32_t s = (*src++) | 0xff000000;
3703             uint32_t d = *dst;
3704
3705             __m64 src  = unpack_32_1x64 (s);
3706             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3707             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3708             __m64 dest  = unpack_32_1x64 (d);
3709
3710             *dst++ = pack_1x64_32 (
3711                 in_over_1x64 (&src, &alpha, &mask, &dest));
3712
3713             w--;
3714         }
3715     }
3716
3717     _mm_empty ();
3718 }
3719
3720 /* --------------------------------------------------------------------
3721  * composite_over_8888_8888
3722  */
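/*
 * Plain premultiplied OVER of a8r8g8b8 onto a8r8g8b8:
 * dest = src + dest * (1 - src_alpha).  Each scanline is handed to
 * core_combine_over_u_sse2 (), which handles alignment itself.
 */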
3723 static void
3724 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3725                                pixman_op_t              op,
3726                                pixman_image_t *         src_image,
3727                                pixman_image_t *         mask_image,
3728                                pixman_image_t *         dst_image,
3729                                int32_t                  src_x,
3730                                int32_t                  src_y,
3731                                int32_t                  mask_x,
3732                                int32_t                  mask_y,
3733                                int32_t                  dest_x,
3734                                int32_t                  dest_y,
3735                                int32_t                  width,
3736                                int32_t                  height)
3737 {
3738     int dst_stride, src_stride;
3739     uint32_t    *dst_line, *dst;
3740     uint32_t    *src_line, *src;
3741
3742     PIXMAN_IMAGE_GET_LINE (
3743         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3744     PIXMAN_IMAGE_GET_LINE (
3745         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3746
3747     dst = dst_line;
3748     src = src_line;
3749
3750     while (height--)
3751     {
3752         core_combine_over_u_sse2 (dst, src, NULL, width);
3753
3754         dst += dst_stride;
3755         src += src_stride;
3756     }
3757     _mm_empty ();
3758 }
3759
3760 /* ------------------------------------------------------------------
3761  * composite_over_8888_0565
3762  */
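/*
 * OVER of an a8r8g8b8 source onto an r5g6b5 destination.  Destination
 * pixels are expanded to 8888, blended with the usual premultiplied
 * OVER, and packed back to 565.  The main loop handles 8 pixels at a
 * time, since 8 destination pixels fill one 128-bit register.
 */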
3763 static force_inline uint16_t
3764 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3765 {
3766     __m64 ms;
3767
3768     ms = unpack_32_1x64 (src);
3769     return pack_565_32_16 (
3770         pack_1x64_32 (
3771             over_1x64 (
3772                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3773 }
3774
3775 static void
3776 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3777                                pixman_op_t              op,
3778                                pixman_image_t *         src_image,
3779                                pixman_image_t *         mask_image,
3780                                pixman_image_t *         dst_image,
3781                                int32_t                  src_x,
3782                                int32_t                  src_y,
3783                                int32_t                  mask_x,
3784                                int32_t                  mask_y,
3785                                int32_t                  dest_x,
3786                                int32_t                  dest_y,
3787                                int32_t                  width,
3788                                int32_t                  height)
3789 {
3790     uint16_t    *dst_line, *dst, d;
3791     uint32_t    *src_line, *src, s;
3792     int dst_stride, src_stride;
3793     int32_t w;
3794
3795     __m128i xmm_alpha_lo, xmm_alpha_hi;
3796     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3797     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3798
3799     PIXMAN_IMAGE_GET_LINE (
3800         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3801     PIXMAN_IMAGE_GET_LINE (
3802         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3803
3804 #if 0
3805     /* FIXME
3806      *
3807      * This code was copied from the MMX implementation along with its FIXME;
3808      * if the assertion is a problem there, it is probably a problem here too.
3809      */
3810     assert (src_image->drawable == mask_image->drawable);
3811 #endif
3812
3813     while (height--)
3814     {
3815         dst = dst_line;
3816         src = src_line;
3817
3818         /* call prefetch hint to optimize cache load*/
3819         cache_prefetch ((__m128i*)src);
3820         cache_prefetch ((__m128i*)dst);
3821
3822         dst_line += dst_stride;
3823         src_line += src_stride;
3824         w = width;
3825
3826         /* Align dst on a 16-byte boundary */
3827         while (w &&
3828                ((unsigned long)dst & 15))
3829         {
3830             s = *src++;
3831             d = *dst;
3832
3833             *dst++ = composite_over_8888_0565pixel (s, d);
3834             w--;
3835         }
3836
3837         /* call prefetch hint to optimize cache load*/
3838         cache_prefetch ((__m128i*)src);
3839         cache_prefetch ((__m128i*)dst);
3840
3841         /* Main loop: process 8 pixels per iteration */
3842         while (w >= 8)
3843         {
3844             /* fill cache line with next memory */
3845             cache_prefetch_next ((__m128i*)src);
3846             cache_prefetch_next ((__m128i*)dst);
3847
3848             /* The source pointer may not be 16-byte aligned,
3849              * so load it unaligned.
3850              */
3851             xmm_src = load_128_unaligned ((__m128i*) src);
3852             xmm_dst = load_128_aligned ((__m128i*) dst);
3853
3854             /* Unpacking */
3855             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3856             unpack_565_128_4x128 (xmm_dst,
3857                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3858             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3859                                 &xmm_alpha_lo, &xmm_alpha_hi);
3860
3861             /* Load the next 4 source pixels early so the memory
3862              * read overlaps with the blend of the first 4 pixels.
3863              */
3864             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3865
3866             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3867                         &xmm_alpha_lo, &xmm_alpha_hi,
3868                         &xmm_dst0, &xmm_dst1);
3869
3870             /* Unpacking */
3871             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3872             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3873                                 &xmm_alpha_lo, &xmm_alpha_hi);
3874
3875             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3876                         &xmm_alpha_lo, &xmm_alpha_hi,
3877                         &xmm_dst2, &xmm_dst3);
3878
3879             save_128_aligned (
3880                 (__m128i*)dst, pack_565_4x128_128 (
3881                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3882
3883             w -= 8;
3884             dst += 8;
3885             src += 8;
3886         }
3887
3888         while (w--)
3889         {
3890             s = *src++;
3891             d = *dst;
3892
3893             *dst++ = composite_over_8888_0565pixel (s, d);
3894         }
3895     }
3896
3897     _mm_empty ();
3898 }
3899
3900 /* -----------------------------------------------------------------
3901  * composite_over_n_8_8888
3902  */
3903
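/*
 * OVER of a solid color onto a8r8g8b8 through an a8 mask:
 * dest = src * m + dest * (1 - src_alpha * m), with m the 8-bit mask
 * value.  The 4-pixel loop skips groups whose mask is entirely zero and
 * stores the precomputed solid color directly when the source is opaque
 * and all four mask bytes are 0xff.
 */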
3904 static void
3905 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3906                               pixman_op_t              op,
3907                               pixman_image_t *         src_image,
3908                               pixman_image_t *         mask_image,
3909                               pixman_image_t *         dst_image,
3910                               int32_t                  src_x,
3911                               int32_t                  src_y,
3912                               int32_t                  mask_x,
3913                               int32_t                  mask_y,
3914                               int32_t                  dest_x,
3915                               int32_t                  dest_y,
3916                               int32_t                  width,
3917                               int32_t                  height)
3918 {
3919     uint32_t src, srca;
3920     uint32_t *dst_line, *dst;
3921     uint8_t *mask_line, *mask;
3922     int dst_stride, mask_stride;
3923     int32_t w;
3924     uint32_t m, d;
3925
3926     __m128i xmm_src, xmm_alpha, xmm_def;
3927     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3928     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3929
3930     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3931
3932     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3933
3934     srca = src >> 24;
3935     if (src == 0)
3936         return;
3937
3938     PIXMAN_IMAGE_GET_LINE (
3939         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3940     PIXMAN_IMAGE_GET_LINE (
3941         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3942
3943     xmm_def = create_mask_2x32_128 (src, src);
3944     xmm_src = expand_pixel_32_1x128 (src);
3945     xmm_alpha = expand_alpha_1x128 (xmm_src);
3946     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3947     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3948
3949     while (height--)
3950     {
3951         dst = dst_line;
3952         dst_line += dst_stride;
3953         mask = mask_line;
3954         mask_line += mask_stride;
3955         w = width;
3956
3957         /* call prefetch hint to optimize cache load*/
3958         cache_prefetch ((__m128i*)mask);
3959         cache_prefetch ((__m128i*)dst);
3960
3961         while (w && (unsigned long)dst & 15)
3962         {
3963             uint8_t m = *mask++;
3964
3965             if (m)
3966             {
3967                 d = *dst;
3968                 mmx_mask = expand_pixel_8_1x64 (m);
3969                 mmx_dest = unpack_32_1x64 (d);
3970
3971                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3972                                                    &mmx_alpha,
3973                                                    &mmx_mask,
3974                                                    &mmx_dest));
3975             }
3976
3977             w--;
3978             dst++;
3979         }
3980
3981         /* call prefetch hint to optimize cache load*/
3982         cache_prefetch ((__m128i*)mask);
3983         cache_prefetch ((__m128i*)dst);
3984
3985         while (w >= 4)
3986         {
3987             /* fill cache line with next memory */
3988             cache_prefetch_next ((__m128i*)mask);
3989             cache_prefetch_next ((__m128i*)dst);
3990
3991             m = *((uint32_t*)mask);
3992
3993             if (srca == 0xff && m == 0xffffffff)
3994             {
3995                 save_128_aligned ((__m128i*)dst, xmm_def);
3996             }
3997             else if (m)
3998             {
3999                 xmm_dst = load_128_aligned ((__m128i*) dst);
4000                 xmm_mask = unpack_32_1x128 (m);
4001                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4002
4003                 /* Unpacking */
4004                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4005                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4006
4007                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4008                                         &xmm_mask_lo, &xmm_mask_hi);
4009
4010                 in_over_2x128 (&xmm_src, &xmm_src,
4011                                &xmm_alpha, &xmm_alpha,
4012                                &xmm_mask_lo, &xmm_mask_hi,
4013                                &xmm_dst_lo, &xmm_dst_hi);
4014
4015                 save_128_aligned (
4016                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4017             }
4018
4019             w -= 4;
4020             dst += 4;
4021             mask += 4;
4022         }
4023
4024         while (w)
4025         {
4026             uint8_t m = *mask++;
4027
4028             if (m)
4029             {
4030                 d = *dst;
4031                 mmx_mask = expand_pixel_8_1x64 (m);
4032                 mmx_dest = unpack_32_1x64 (d);
4033
4034                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
4035                                                    &mmx_alpha,
4036                                                    &mmx_mask,
4037                                                    &mmx_dest));
4038             }
4039
4040             w--;
4041             dst++;
4042         }
4043     }
4044
4045     _mm_empty ();
4046 }
4047
4048 /* ----------------------------------------------------------------
4049  * pixman_fill_sse2
4050  */
4051
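/*
 * Solid fill of a rectangle with SSE2 stores.  For 8 and 16 bpp the fill
 * value is first replicated to 32 bits (e.g. for bpp == 8 a value of
 * 0x7f becomes 0x7f7f7f7f).  Each row is filled by aligning the pointer
 * with 1-, 2- and 4-byte stores, then writing 128-, 64-, 32- and 16-byte
 * aligned blocks, and finishing with a small scalar tail.
 *
 * A hypothetical call filling a 100 x 50 area of an a8r8g8b8 buffer with
 * opaque red (stride is in uint32_t units):
 *
 *     pixman_fill_sse2 (bits, stride, 32, 10, 20, 100, 50, 0xffff0000);
 */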
4052 pixman_bool_t
4053 pixman_fill_sse2 (uint32_t *bits,
4054                   int       stride,
4055                   int       bpp,
4056                   int       x,
4057                   int       y,
4058                   int       width,
4059                   int       height,
4060                   uint32_t  data)
4061 {
4062     uint32_t byte_width;
4063     uint8_t         *byte_line;
4064
4065     __m128i xmm_def;
4066
4067     if (bpp == 8)
4068     {
4069         uint8_t b;
4070         uint16_t w;
4071
4072         stride = stride * (int) sizeof (uint32_t) / 1;
4073         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
4074         byte_width = width;
4075         stride *= 1;
4076
4077         b = data & 0xff;
4078         w = (b << 8) | b;
4079         data = (w << 16) | w;
4080     }
4081     else if (bpp == 16)
4082     {
4083         stride = stride * (int) sizeof (uint32_t) / 2;
4084         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
4085         byte_width = 2 * width;
4086         stride *= 2;
4087
4088         data = (data & 0xffff) * 0x00010001;
4089     }
4090     else if (bpp == 32)
4091     {
4092         stride = stride * (int) sizeof (uint32_t) / 4;
4093         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
4094         byte_width = 4 * width;
4095         stride *= 4;
4096     }
4097     else
4098     {
4099         return FALSE;
4100     }
4101
4102     cache_prefetch ((__m128i*)byte_line);
4103     xmm_def = create_mask_2x32_128 (data, data);
4104
4105     while (height--)
4106     {
4107         int w;
4108         uint8_t *d = byte_line;
4109         byte_line += stride;
4110         w = byte_width;
4111
4112         cache_prefetch_next ((__m128i*)d);
4113
4114         while (w >= 1 && ((unsigned long)d & 1))
4115         {
4116             *(uint8_t *)d = data;
4117             w -= 1;
4118             d += 1;
4119         }
4120
4121         while (w >= 2 && ((unsigned long)d & 3))
4122         {
4123             *(uint16_t *)d = data;
4124             w -= 2;
4125             d += 2;
4126         }
4127
4128         while (w >= 4 && ((unsigned long)d & 15))
4129         {
4130             *(uint32_t *)d = data;
4131
4132             w -= 4;
4133             d += 4;
4134         }
4135
4136         cache_prefetch_next ((__m128i*)d);
4137
4138         while (w >= 128)
4139         {
4140             cache_prefetch (((__m128i*)d) + 12);
4141
4142             save_128_aligned ((__m128i*)(d),     xmm_def);
4143             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4144             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4145             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4146             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
4147             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
4148             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
4149             save_128_aligned ((__m128i*)(d + 112), xmm_def);
4150
4151             d += 128;
4152             w -= 128;
4153         }
4154
4155         if (w >= 64)
4156         {
4157             cache_prefetch (((__m128i*)d) + 8);
4158
4159             save_128_aligned ((__m128i*)(d),     xmm_def);
4160             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4161             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4162             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4163
4164             d += 64;
4165             w -= 64;
4166         }
4167
4168         cache_prefetch_next ((__m128i*)d);
4169
4170         if (w >= 32)
4171         {
4172             save_128_aligned ((__m128i*)(d),     xmm_def);
4173             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4174
4175             d += 32;
4176             w -= 32;
4177         }
4178
4179         if (w >= 16)
4180         {
4181             save_128_aligned ((__m128i*)(d),     xmm_def);
4182
4183             d += 16;
4184             w -= 16;
4185         }
4186
4187         cache_prefetch_next ((__m128i*)d);
4188
4189         while (w >= 4)
4190         {
4191             *(uint32_t *)d = data;
4192
4193             w -= 4;
4194             d += 4;
4195         }
4196
4197         if (w >= 2)
4198         {
4199             *(uint16_t *)d = data;
4200             w -= 2;
4201             d += 2;
4202         }
4203
4204         if (w >= 1)
4205         {
4206             *(uint8_t *)d = data;
4207             w -= 1;
4208             d += 1;
4209         }
4210     }
4211
4212     _mm_empty ();
4213     return TRUE;
4214 }
4215
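/*
 * SRC of a solid color through an a8 mask onto a8r8g8b8:
 * dest = src * m, and zero where the mask is zero.  A zero source is
 * turned into a pixman_fill_sse2 () of zeros; an opaque source with a
 * fully set 4-byte mask group is stored directly as the solid color.
 */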
4216 static void
4217 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4218                              pixman_op_t              op,
4219                              pixman_image_t *         src_image,
4220                              pixman_image_t *         mask_image,
4221                              pixman_image_t *         dst_image,
4222                              int32_t                  src_x,
4223                              int32_t                  src_y,
4224                              int32_t                  mask_x,
4225                              int32_t                  mask_y,
4226                              int32_t                  dest_x,
4227                              int32_t                  dest_y,
4228                              int32_t                  width,
4229                              int32_t                  height)
4230 {
4231     uint32_t src, srca;
4232     uint32_t    *dst_line, *dst;
4233     uint8_t     *mask_line, *mask;
4234     int dst_stride, mask_stride;
4235     int32_t w;
4236     uint32_t m;
4237
4238     __m128i xmm_src, xmm_def;
4239     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4240
4241     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4242
4243     srca = src >> 24;
4244     if (src == 0)
4245     {
4246         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4247                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
4248                           dest_x, dest_y, width, height, 0);
4249         return;
4250     }
4251
4252     PIXMAN_IMAGE_GET_LINE (
4253         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4254     PIXMAN_IMAGE_GET_LINE (
4255         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4256
4257     xmm_def = create_mask_2x32_128 (src, src);
4258     xmm_src = expand_pixel_32_1x128 (src);
4259
4260     while (height--)
4261     {
4262         dst = dst_line;
4263         dst_line += dst_stride;
4264         mask = mask_line;
4265         mask_line += mask_stride;
4266         w = width;
4267
4268         /* call prefetch hint to optimize cache load*/
4269         cache_prefetch ((__m128i*)mask);
4270         cache_prefetch ((__m128i*)dst);
4271
4272         while (w && (unsigned long)dst & 15)
4273         {
4274             uint8_t m = *mask++;
4275
4276             if (m)
4277             {
4278                 *dst = pack_1x64_32 (
4279                     pix_multiply_1x64 (
4280                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4281             }
4282             else
4283             {
4284                 *dst = 0;
4285             }
4286
4287             w--;
4288             dst++;
4289         }
4290
4291         /* call prefetch hint to optimize cache load*/
4292         cache_prefetch ((__m128i*)mask);
4293         cache_prefetch ((__m128i*)dst);
4294
4295         while (w >= 4)
4296         {
4297             /* fill cache line with next memory */
4298             cache_prefetch_next ((__m128i*)mask);
4299             cache_prefetch_next ((__m128i*)dst);
4300
4301             m = *((uint32_t*)mask);
4302
4303             if (srca == 0xff && m == 0xffffffff)
4304             {
4305                 save_128_aligned ((__m128i*)dst, xmm_def);
4306             }
4307             else if (m)
4308             {
4309                 xmm_mask = unpack_32_1x128 (m);
4310                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4311
4312                 /* Unpacking */
4313                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4314
4315                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4316                                         &xmm_mask_lo, &xmm_mask_hi);
4317
4318                 pix_multiply_2x128 (&xmm_src, &xmm_src,
4319                                     &xmm_mask_lo, &xmm_mask_hi,
4320                                     &xmm_mask_lo, &xmm_mask_hi);
4321
4322                 save_128_aligned (
4323                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4324             }
4325             else
4326             {
4327                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4328             }
4329
4330             w -= 4;
4331             dst += 4;
4332             mask += 4;
4333         }
4334
4335         while (w)
4336         {
4337             uint8_t m = *mask++;
4338
4339             if (m)
4340             {
4341                 *dst = pack_1x64_32 (
4342                     pix_multiply_1x64 (
4343                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4344             }
4345             else
4346             {
4347                 *dst = 0;
4348             }
4349
4350             w--;
4351             dst++;
4352         }
4353     }
4354
4355     _mm_empty ();
4356 }
4357
4358 /*-----------------------------------------------------------------------
4359  * composite_over_n_8_0565
4360  */
4361
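/*
 * OVER of a solid color onto an r5g6b5 destination through an a8 mask.
 * Destination pixels are expanded to 8888, blended as
 * dest = src * m + dest * (1 - src_alpha * m), and packed back to 565.
 * The main loop handles 8 pixels per iteration, reading the mask in two
 * 4-byte groups and skipping groups that are entirely zero.
 */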
4362 static void
4363 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4364                               pixman_op_t              op,
4365                               pixman_image_t *         src_image,
4366                               pixman_image_t *         mask_image,
4367                               pixman_image_t *         dst_image,
4368                               int32_t                  src_x,
4369                               int32_t                  src_y,
4370                               int32_t                  mask_x,
4371                               int32_t                  mask_y,
4372                               int32_t                  dest_x,
4373                               int32_t                  dest_y,
4374                               int32_t                  width,
4375                               int32_t                  height)
4376 {
4377     uint32_t src, srca;
4378     uint16_t    *dst_line, *dst, d;
4379     uint8_t     *mask_line, *mask;
4380     int dst_stride, mask_stride;
4381     int32_t w;
4382     uint32_t m;
4383     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4384
4385     __m128i xmm_src, xmm_alpha;
4386     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4387     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4388
4389     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4390
4391     srca = src >> 24;
4392     if (src == 0)
4393         return;
4394
4395     PIXMAN_IMAGE_GET_LINE (
4396         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4397     PIXMAN_IMAGE_GET_LINE (
4398         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4399
4400     xmm_src = expand_pixel_32_1x128 (src);
4401     xmm_alpha = expand_alpha_1x128 (xmm_src);
4402     mmx_src = _mm_movepi64_pi64 (xmm_src);
4403     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4404
4405     while (height--)
4406     {
4407         dst = dst_line;
4408         dst_line += dst_stride;
4409         mask = mask_line;
4410         mask_line += mask_stride;
4411         w = width;
4412
4413         /* call prefetch hint to optimize cache load*/
4414         cache_prefetch ((__m128i*)mask);
4415         cache_prefetch ((__m128i*)dst);
4416
4417         while (w && (unsigned long)dst & 15)
4418         {
4419             m = *mask++;
4420
4421             if (m)
4422             {
4423                 d = *dst;
4424                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4425                 mmx_dest = expand565_16_1x64 (d);
4426
4427                 *dst = pack_565_32_16 (
4428                     pack_1x64_32 (
4429                         in_over_1x64 (
4430                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4431             }
4432
4433             w--;
4434             dst++;
4435         }
4436
4437         /* call prefetch hint to optimize cache load*/
4438         cache_prefetch ((__m128i*)mask);
4439         cache_prefetch ((__m128i*)dst);
4440
4441         while (w >= 8)
4442         {
4443             /* fill cache line with next memory */
4444             cache_prefetch_next ((__m128i*)mask);
4445             cache_prefetch_next ((__m128i*)dst);
4446
4447             xmm_dst = load_128_aligned ((__m128i*) dst);
4448             unpack_565_128_4x128 (xmm_dst,
4449                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4450
4451             m = *((uint32_t*)mask);
4452             mask += 4;
4453
4454             if (m)
4455             {
4456                 xmm_mask = unpack_32_1x128 (m);
4457                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4458
4459                 /* Unpacking */
4460                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4461
4462                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4463                                         &xmm_mask_lo, &xmm_mask_hi);
4464
4465                 in_over_2x128 (&xmm_src, &xmm_src,
4466                                &xmm_alpha, &xmm_alpha,
4467                                &xmm_mask_lo, &xmm_mask_hi,
4468                                &xmm_dst0, &xmm_dst1);
4469             }
4470
4471             m = *((uint32_t*)mask);
4472             mask += 4;
4473
4474             if (m)
4475             {
4476                 xmm_mask = unpack_32_1x128 (m);
4477                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4478
4479                 /* Unpacking */
4480                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4481
4482                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4483                                         &xmm_mask_lo, &xmm_mask_hi);
4484                 in_over_2x128 (&xmm_src, &xmm_src,
4485                                &xmm_alpha, &xmm_alpha,
4486                                &xmm_mask_lo, &xmm_mask_hi,
4487                                &xmm_dst2, &xmm_dst3);
4488             }
4489
4490             save_128_aligned (
4491                 (__m128i*)dst, pack_565_4x128_128 (
4492                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4493
4494             w -= 8;
4495             dst += 8;
4496         }
4497
4498         while (w)
4499         {
4500             m = *mask++;
4501
4502             if (m)
4503             {
4504                 d = *dst;
4505                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4506                 mmx_dest = expand565_16_1x64 (d);
4507
4508                 *dst = pack_565_32_16 (
4509                     pack_1x64_32 (
4510                         in_over_1x64 (
4511                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4512             }
4513
4514             w--;
4515             dst++;
4516         }
4517     }
4518
4519     _mm_empty ();
4520 }
4521
4522 /* -----------------------------------------------------------------------
4523  * composite_over_pixbuf_0565
4524  */
4525
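/*
 * OVER of a "pixbuf" source (non-premultiplied, with red and blue
 * swapped relative to the destination, as in GdkPixbuf-style RGBA) onto
 * an r5g6b5 destination.  Each group of 4 source pixels is checked:
 * fully opaque groups are just channel-swapped and converted straight to
 * 565, fully transparent groups leave the destination untouched, and the
 * rest go through over_rev_non_pre, which swaps channels, premultiplies
 * by alpha and then applies the usual OVER.
 */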
4526 static void
4527 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4528                                  pixman_op_t              op,
4529                                  pixman_image_t *         src_image,
4530                                  pixman_image_t *         mask_image,
4531                                  pixman_image_t *         dst_image,
4532                                  int32_t                  src_x,
4533                                  int32_t                  src_y,
4534                                  int32_t                  mask_x,
4535                                  int32_t                  mask_y,
4536                                  int32_t                  dest_x,
4537                                  int32_t                  dest_y,
4538                                  int32_t                  width,
4539                                  int32_t                  height)
4540 {
4541     uint16_t    *dst_line, *dst, d;
4542     uint32_t    *src_line, *src, s;
4543     int dst_stride, src_stride;
4544     int32_t w;
4545     uint32_t opaque, zero;
4546
4547     __m64 ms;
4548     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4549     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4550
4551     PIXMAN_IMAGE_GET_LINE (
4552         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4553     PIXMAN_IMAGE_GET_LINE (
4554         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4555
4556 #if 0
4557     /* FIXME
4558      *
4559      * This code was copied from the MMX implementation along with its FIXME;
4560      * if the assertion is a problem there, it is probably a problem here too.
4561      */
4562     assert (src_image->drawable == mask_image->drawable);
4563 #endif
4564
4565     while (height--)
4566     {
4567         dst = dst_line;
4568         dst_line += dst_stride;
4569         src = src_line;
4570         src_line += src_stride;
4571         w = width;
4572
4573         /* call prefetch hint to optimize cache load*/
4574         cache_prefetch ((__m128i*)src);
4575         cache_prefetch ((__m128i*)dst);
4576
4577         while (w && (unsigned long)dst & 15)
4578         {
4579             s = *src++;
4580             d = *dst;
4581
4582             ms = unpack_32_1x64 (s);
4583
4584             *dst++ = pack_565_32_16 (
4585                 pack_1x64_32 (
4586                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4587             w--;
4588         }
4589
4590         /* call prefetch hint to optimize cache load*/
4591         cache_prefetch ((__m128i*)src);
4592         cache_prefetch ((__m128i*)dst);
4593
4594         while (w >= 8)
4595         {
4596             /* fill cache line with next memory */
4597             cache_prefetch_next ((__m128i*)src);
4598             cache_prefetch_next ((__m128i*)dst);
4599
4600             /* First round */
4601             xmm_src = load_128_unaligned ((__m128i*)src);
4602             xmm_dst = load_128_aligned  ((__m128i*)dst);
4603
4604             opaque = is_opaque (xmm_src);
4605             zero = is_zero (xmm_src);
4606
4607             unpack_565_128_4x128 (xmm_dst,
4608                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4609             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4610
4611             /* preload next round*/
4612             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4613
4614             if (opaque)
4615             {
4616                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4617                                      &xmm_dst0, &xmm_dst1);
4618             }
4619             else if (!zero)
4620             {
4621                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4622                                         &xmm_dst0, &xmm_dst1);
4623             }
4624
4625             /* Second round */
4626             opaque = is_opaque (xmm_src);
4627             zero = is_zero (xmm_src);
4628
4629             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4630
4631             if (opaque)
4632             {
4633                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4634                                      &xmm_dst2, &xmm_dst3);
4635             }
4636             else if (!zero)
4637             {
4638                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4639                                         &xmm_dst2, &xmm_dst3);
4640             }
4641
4642             save_128_aligned (
4643                 (__m128i*)dst, pack_565_4x128_128 (
4644                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4645
4646             w -= 8;
4647             src += 8;
4648             dst += 8;
4649         }
4650
4651         while (w)
4652         {
4653             s = *src++;
4654             d = *dst;
4655
4656             ms = unpack_32_1x64 (s);
4657
4658             *dst++ = pack_565_32_16 (
4659                 pack_1x64_32 (
4660                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4661             w--;
4662         }
4663     }
4664
4665     _mm_empty ();
4666 }
4667
4668 /* -------------------------------------------------------------------------
4669  * composite_over_pixbuf_8888
4670  */
4671
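/*
 * Same "pixbuf" source handling as above, but with an a8r8g8b8
 * destination: fully opaque 4-pixel groups are channel-swapped and
 * stored directly, fully transparent groups are skipped, and the rest
 * are blended with over_rev_non_pre.
 */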
4672 static void
4673 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4674                                  pixman_op_t              op,
4675                                  pixman_image_t *         src_image,
4676                                  pixman_image_t *         mask_image,
4677                                  pixman_image_t *         dst_image,
4678                                  int32_t                  src_x,
4679                                  int32_t                  src_y,
4680                                  int32_t                  mask_x,
4681                                  int32_t                  mask_y,
4682                                  int32_t                  dest_x,
4683                                  int32_t                  dest_y,
4684                                  int32_t                  width,
4685                                  int32_t                  height)
4686 {
4687     uint32_t    *dst_line, *dst, d;
4688     uint32_t    *src_line, *src, s;
4689     int dst_stride, src_stride;
4690     int32_t w;
4691     uint32_t opaque, zero;
4692
4693     __m128i xmm_src_lo, xmm_src_hi;
4694     __m128i xmm_dst_lo, xmm_dst_hi;
4695
4696     PIXMAN_IMAGE_GET_LINE (
4697         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4698     PIXMAN_IMAGE_GET_LINE (
4699         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4700
4701 #if 0
4702     /* FIXME
4703      *
4704      * This code was copied from the MMX implementation along with its FIXME;
4705      * if the assertion is a problem there, it is probably a problem here too.
4706      */
4707     assert (src_image->drawable == mask_image->drawable);
4708 #endif
4709
4710     while (height--)
4711     {
4712         dst = dst_line;
4713         dst_line += dst_stride;
4714         src = src_line;
4715         src_line += src_stride;
4716         w = width;
4717
4718         /* call prefetch hint to optimize cache load*/
4719         cache_prefetch ((__m128i*)src);
4720         cache_prefetch ((__m128i*)dst);
4721
4722         while (w && (unsigned long)dst & 15)
4723         {
4724             s = *src++;
4725             d = *dst;
4726
4727             *dst++ = pack_1x64_32 (
4728                 over_rev_non_pre_1x64 (
4729                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4730
4731             w--;
4732         }
4733
4734         /* call prefetch hint to optimize cache load*/
4735         cache_prefetch ((__m128i*)src);
4736         cache_prefetch ((__m128i*)dst);
4737
4738         while (w >= 4)
4739         {
4740             /* fill cache line with next memory */
4741             cache_prefetch_next ((__m128i*)src);
4742             cache_prefetch_next ((__m128i*)dst);
4743
4744             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4745
4746             opaque = is_opaque (xmm_src_hi);
4747             zero = is_zero (xmm_src_hi);
4748
4749             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4750
4751             if (opaque)
4752             {
4753                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4754                                      &xmm_dst_lo, &xmm_dst_hi);
4755
4756                 save_128_aligned (
4757                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4758             }
4759             else if (!zero)
4760             {
4761                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4762
4763                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4764
4765                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4766                                         &xmm_dst_lo, &xmm_dst_hi);
4767
4768                 save_128_aligned (
4769                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4770             }
4771
4772             w -= 4;
4773             dst += 4;
4774             src += 4;
4775         }
4776
4777         while (w)
4778         {
4779             s = *src++;
4780             d = *dst;
4781
4782             *dst++ = pack_1x64_32 (
4783                 over_rev_non_pre_1x64 (
4784                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4785
4786             w--;
4787         }
4788     }
4789
4790     _mm_empty ();
4791 }
4792
4793 /* -----------------------------------------------------------------------
4794  * composite_over_n_8888_0565_ca
4795  */
4796
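/*
 * Component-alpha OVER of a solid color onto an r5g6b5 destination with
 * an a8r8g8b8 mask: each mask channel scales the matching source
 * channel, so per channel dest = src * m + dest * (1 - src_alpha * m).
 * The loop works on 8 destination pixels (two 4-pixel mask groups) per
 * iteration and skips mask groups that are entirely zero.
 */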
4797 static void
4798 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4799                                     pixman_op_t              op,
4800                                     pixman_image_t *         src_image,
4801                                     pixman_image_t *         mask_image,
4802                                     pixman_image_t *         dst_image,
4803                                     int32_t                  src_x,
4804                                     int32_t                  src_y,
4805                                     int32_t                  mask_x,
4806                                     int32_t                  mask_y,
4807                                     int32_t                  dest_x,
4808                                     int32_t                  dest_y,
4809                                     int32_t                  width,
4810                                     int32_t                  height)
4811 {
4812     uint32_t src;
4813     uint16_t    *dst_line, *dst, d;
4814     uint32_t    *mask_line, *mask, m;
4815     int dst_stride, mask_stride;
4816     int w;
4817     uint32_t pack_cmp;
4818
4819     __m128i xmm_src, xmm_alpha;
4820     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4821     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4822
4823     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4824
4825     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4826
4827     if (src == 0)
4828         return;
4829
4830     PIXMAN_IMAGE_GET_LINE (
4831         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4832     PIXMAN_IMAGE_GET_LINE (
4833         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4834
4835     xmm_src = expand_pixel_32_1x128 (src);
4836     xmm_alpha = expand_alpha_1x128 (xmm_src);
4837     mmx_src = _mm_movepi64_pi64 (xmm_src);
4838     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4839
4840     while (height--)
4841     {
4842         w = width;
4843         mask = mask_line;
4844         dst = dst_line;
4845         mask_line += mask_stride;
4846         dst_line += dst_stride;
4847
4848         /* call prefetch hint to optimize cache load*/
4849         cache_prefetch ((__m128i*)mask);
4850         cache_prefetch ((__m128i*)dst);
4851
4852         while (w && ((unsigned long)dst & 15))
4853         {
4854             m = *(uint32_t *) mask;
4855
4856             if (m)
4857             {
4858                 d = *dst;
4859                 mmx_mask = unpack_32_1x64 (m);
4860                 mmx_dest = expand565_16_1x64 (d);
4861
4862                 *dst = pack_565_32_16 (
4863                     pack_1x64_32 (
4864                         in_over_1x64 (
4865                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4866             }
4867
4868             w--;
4869             dst++;
4870             mask++;
4871         }
4872
4873         /* call prefetch hint to optimize cache load*/
4874         cache_prefetch ((__m128i*)mask);
4875         cache_prefetch ((__m128i*)dst);
4876
4877         while (w >= 8)
4878         {
4879             /* fill cache line with next memory */
4880             cache_prefetch_next ((__m128i*)mask);
4881             cache_prefetch_next ((__m128i*)dst);
4882
4883             /* First round */
4884             xmm_mask = load_128_unaligned ((__m128i*)mask);
4885             xmm_dst = load_128_aligned ((__m128i*)dst);
4886
4887             pack_cmp = _mm_movemask_epi8 (
4888                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4889
4890             unpack_565_128_4x128 (xmm_dst,
4891                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4892             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4893
4894             /* preload next round */
4895             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4896
4897             /* blend the first four pixels only if some mask bits are set */
4898             if (pack_cmp != 0xffff)
4899             {
4900                 in_over_2x128 (&xmm_src, &xmm_src,
4901                                &xmm_alpha, &xmm_alpha,
4902                                &xmm_mask_lo, &xmm_mask_hi,
4903                                &xmm_dst0, &xmm_dst1);
4904             }
4905
4906             /* Second round */
4907             pack_cmp = _mm_movemask_epi8 (
4908                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4909
4910             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4911
4912             if (pack_cmp != 0xffff)
4913             {
4914                 in_over_2x128 (&xmm_src, &xmm_src,
4915                                &xmm_alpha, &xmm_alpha,
4916                                &xmm_mask_lo, &xmm_mask_hi,
4917                                &xmm_dst2, &xmm_dst3);
4918             }
4919
4920             save_128_aligned (
4921                 (__m128i*)dst, pack_565_4x128_128 (
4922                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4923
4924             w -= 8;
4925             dst += 8;
4926             mask += 8;
4927         }
4928
4929         while (w)
4930         {
4931             m = *(uint32_t *) mask;
4932
4933             if (m)
4934             {
4935                 d = *dst;
4936                 mmx_mask = unpack_32_1x64 (m);
4937                 mmx_dest = expand565_16_1x64 (d);
4938
4939                 *dst = pack_565_32_16 (
4940                     pack_1x64_32 (
4941                         in_over_1x64 (
4942                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4943             }
4944
4945             w--;
4946             dst++;
4947             mask++;
4948         }
4949     }
4950
4951     _mm_empty ();
4952 }
4953
4954 /* -----------------------------------------------------------------------
4955  * composite_in_n_8_8
4956  */
4957
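/*
 * IN of a solid color's alpha and an a8 mask into an a8 destination:
 * dest = dest * (src_alpha * m).  Everything is 8 bits wide, so the
 * main loop processes 16 pixels per 128-bit load/store.
 */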
4958 static void
4959 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4960                          pixman_op_t              op,
4961                          pixman_image_t *         src_image,
4962                          pixman_image_t *         mask_image,
4963                          pixman_image_t *         dst_image,
4964                          int32_t                  src_x,
4965                          int32_t                  src_y,
4966                          int32_t                  mask_x,
4967                          int32_t                  mask_y,
4968                          int32_t                  dest_x,
4969                          int32_t                  dest_y,
4970                          int32_t                  width,
4971                          int32_t                  height)
4972 {
4973     uint8_t     *dst_line, *dst;
4974     uint8_t     *mask_line, *mask;
4975     int dst_stride, mask_stride;
4976     uint32_t d, m;
4977     uint32_t src;
4978     uint8_t sa;
4979     int32_t w;
4980
4981     __m128i xmm_alpha;
4982     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4983     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4984
4985     PIXMAN_IMAGE_GET_LINE (
4986         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4987     PIXMAN_IMAGE_GET_LINE (
4988         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4989
4990     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4991
4992     sa = src >> 24;
4993
4994     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4995
4996     while (height--)
4997     {
4998         dst = dst_line;
4999         dst_line += dst_stride;
5000         mask = mask_line;
5001         mask_line += mask_stride;
5002         w = width;
5003
5004         /* call prefetch hint to optimize cache load*/
5005         cache_prefetch ((__m128i*)mask);
5006         cache_prefetch ((__m128i*)dst);
5007
5008         while (w && ((unsigned long)dst & 15))
5009         {
5010             m = (uint32_t) *mask++;
5011             d = (uint32_t) *dst;
5012
5013             *dst++ = (uint8_t) pack_1x64_32 (
5014                 pix_multiply_1x64 (
5015                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
5016                                        unpack_32_1x64 (m)),
5017                     unpack_32_1x64 (d)));
5018             w--;
5019         }
5020
5021         /* call prefetch hint to optimize cache load*/
5022         cache_prefetch ((__m128i*)mask);
5023         cache_prefetch ((__m128i*)dst);
5024
5025         while (w >= 16)
5026         {
5027             /* fill cache line with next memory */
5028             cache_prefetch_next ((__m128i*)mask);
5029             cache_prefetch_next ((__m128i*)dst);
5030
5031             xmm_mask = load_128_unaligned ((__m128i*)mask);
5032             xmm_dst = load_128_aligned ((__m128i*)dst);
5033
5034             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5035             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5036
5037             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5038                                 &xmm_mask_lo, &xmm_mask_hi,
5039                                 &xmm_mask_lo, &xmm_mask_hi);
5040
5041             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
5042                                 &xmm_dst_lo, &xmm_dst_hi,
5043                                 &xmm_dst_lo, &xmm_dst_hi);
5044
5045             save_128_aligned (
5046                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5047
5048             mask += 16;
5049             dst += 16;
5050             w -= 16;
5051         }
5052
5053         while (w)
5054         {
5055             m = (uint32_t) *mask++;
5056             d = (uint32_t) *dst;
5057
5058             *dst++ = (uint8_t) pack_1x64_32 (
5059                 pix_multiply_1x64 (
5060                     pix_multiply_1x64 (
5061                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5062                     unpack_32_1x64 (d)));
5063             w--;
5064         }
5065     }
5066
5067     _mm_empty ();
5068 }
5069
5070 /* -----------------------------------------------------------------------
5071  * composite_in_n_8
5072  */
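     /*
      * 'in' with a solid source and no mask: dst = (srca * dst) / 255 for
      * every byte.  srca == 0xff is a no-op and srca == 0x00 clears the
      * rectangle with pixman_fill (), so the loops below only run for
      * intermediate alpha values.
      */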
5073
5074 static void
5075 sse2_composite_in_n_8 (pixman_implementation_t *imp,
5076                        pixman_op_t              op,
5077                        pixman_image_t *         src_image,
5078                        pixman_image_t *         mask_image,
5079                        pixman_image_t *         dst_image,
5080                        int32_t                  src_x,
5081                        int32_t                  src_y,
5082                        int32_t                  mask_x,
5083                        int32_t                  mask_y,
5084                        int32_t                  dest_x,
5085                        int32_t                  dest_y,
5086                        int32_t                  width,
5087                        int32_t                  height)
5088 {
5089     uint8_t     *dst_line, *dst;
5090     int dst_stride;
5091     uint32_t d;
5092     uint32_t src;
5093     int32_t w;
5094
5095     __m128i xmm_alpha;
5096     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5097
5098     PIXMAN_IMAGE_GET_LINE (
5099         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5100
5101     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5102
5103     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5104
5105     src = src >> 24;
5106
5107     if (src == 0xff)
5108         return;
5109
5110     if (src == 0x00)
5111     {
5112         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
5113                      8, dest_x, dest_y, width, height, src);
5114
5115         return;
5116     }
5117
5118     while (height--)
5119     {
5120         dst = dst_line;
5121         dst_line += dst_stride;
5122         w = width;
5123
5124         /* call prefetch hint to optimize cache load*/
5125         cache_prefetch ((__m128i*)dst);
5126
5127         while (w && ((unsigned long)dst & 15))
5128         {
5129             d = (uint32_t) *dst;
5130
5131             *dst++ = (uint8_t) pack_1x64_32 (
5132                 pix_multiply_1x64 (
5133                     _mm_movepi64_pi64 (xmm_alpha),
5134                     unpack_32_1x64 (d)));
5135             w--;
5136         }
5137
5138         /* call prefetch hint to optimize cache load*/
5139         cache_prefetch ((__m128i*)dst);
5140
5141         while (w >= 16)
5142         {
5143             /* fill cache line with next memory */
5144             cache_prefetch_next ((__m128i*)dst);
5145
5146             xmm_dst = load_128_aligned ((__m128i*)dst);
5147
5148             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5149
5150             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5151                                 &xmm_dst_lo, &xmm_dst_hi,
5152                                 &xmm_dst_lo, &xmm_dst_hi);
5153
5154             save_128_aligned (
5155                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5156
5157             dst += 16;
5158             w -= 16;
5159         }
5160
5161         while (w)
5162         {
5163             d = (uint32_t) *dst;
5164
5165             *dst++ = (uint8_t) pack_1x64_32 (
5166                 pix_multiply_1x64 (
5167                     _mm_movepi64_pi64 (xmm_alpha),
5168                     unpack_32_1x64 (d)));
5169             w--;
5170         }
5171     }
5172
5173     _mm_empty ();
5174 }
5175
5176 /* ---------------------------------------------------------------------------
5177  * composite_in_8_8
5178  */
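     /*
      * 'in' between two a8 images: dst = (src * dst) / 255 per byte,
      * 16 bytes per SSE2 iteration.
      */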
5179
5180 static void
5181 sse2_composite_in_8_8 (pixman_implementation_t *imp,
5182                        pixman_op_t              op,
5183                        pixman_image_t *         src_image,
5184                        pixman_image_t *         mask_image,
5185                        pixman_image_t *         dst_image,
5186                        int32_t                  src_x,
5187                        int32_t                  src_y,
5188                        int32_t                  mask_x,
5189                        int32_t                  mask_y,
5190                        int32_t                  dest_x,
5191                        int32_t                  dest_y,
5192                        int32_t                  width,
5193                        int32_t                  height)
5194 {
5195     uint8_t     *dst_line, *dst;
5196     uint8_t     *src_line, *src;
5197     int src_stride, dst_stride;
5198     int32_t w;
5199     uint32_t s, d;
5200
5201     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5202     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5203
5204     PIXMAN_IMAGE_GET_LINE (
5205         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5206     PIXMAN_IMAGE_GET_LINE (
5207         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5208
5209     while (height--)
5210     {
5211         dst = dst_line;
5212         dst_line += dst_stride;
5213         src = src_line;
5214         src_line += src_stride;
5215         w = width;
5216
5217         /* call prefetch hint to optimize cache load*/
5218         cache_prefetch ((__m128i*)src);
5219         cache_prefetch ((__m128i*)dst);
5220
5221         while (w && ((unsigned long)dst & 15))
5222         {
5223             s = (uint32_t) *src++;
5224             d = (uint32_t) *dst;
5225
5226             *dst++ = (uint8_t) pack_1x64_32 (
5227                 pix_multiply_1x64 (
5228                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
5229             w--;
5230         }
5231
5232         /* call prefetch hint to optimize cache load*/
5233         cache_prefetch ((__m128i*)src);
5234         cache_prefetch ((__m128i*)dst);
5235
5236         while (w >= 16)
5237         {
5238             /* fill cache line with next memory */
5239             cache_prefetch_next ((__m128i*)src);
5240             cache_prefetch_next ((__m128i*)dst);
5241
5242             xmm_src = load_128_unaligned ((__m128i*)src);
5243             xmm_dst = load_128_aligned ((__m128i*)dst);
5244
5245             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5246             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5247
5248             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5249                                 &xmm_dst_lo, &xmm_dst_hi,
5250                                 &xmm_dst_lo, &xmm_dst_hi);
5251
5252             save_128_aligned (
5253                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5254
5255             src += 16;
5256             dst += 16;
5257             w -= 16;
5258         }
5259
5260         while (w)
5261         {
5262             s = (uint32_t) *src++;
5263             d = (uint32_t) *dst;
5264
5265             *dst++ = (uint8_t) pack_1x64_32 (
5266                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5267             w--;
5268         }
5269     }
5270
5271     _mm_empty ();
5272 }
5273
5274 /* -------------------------------------------------------------------------
5275  * composite_add_n_8_8
5276  */
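     /*
      * 'add' with a solid source and an a8 mask:
      * dst = MIN (255, dst + (srca * m) / 255).  The multiply runs on the
      * unpacked 16-bit lanes; the clamp to 255 falls out of the unsigned
      * saturating pack back to bytes.
      */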
5277
5278 static void
5279 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5280                           pixman_op_t              op,
5281                           pixman_image_t *         src_image,
5282                           pixman_image_t *         mask_image,
5283                           pixman_image_t *         dst_image,
5284                           int32_t                  src_x,
5285                           int32_t                  src_y,
5286                           int32_t                  mask_x,
5287                           int32_t                  mask_y,
5288                           int32_t                  dest_x,
5289                           int32_t                  dest_y,
5290                           int32_t                  width,
5291                           int32_t                  height)
5292 {
5293     uint8_t     *dst_line, *dst;
5294     uint8_t     *mask_line, *mask;
5295     int dst_stride, mask_stride;
5296     int32_t w;
5297     uint32_t src;
5298     uint8_t sa;
5299     uint32_t m, d;
5300
5301     __m128i xmm_alpha;
5302     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5303     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5304
5305     PIXMAN_IMAGE_GET_LINE (
5306         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5307     PIXMAN_IMAGE_GET_LINE (
5308         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5309
5310     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5311
5312     sa = src >> 24;
5313
5314     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5315
5316     while (height--)
5317     {
5318         dst = dst_line;
5319         dst_line += dst_stride;
5320         mask = mask_line;
5321         mask_line += mask_stride;
5322         w = width;
5323
5324         /* call prefetch hint to optimize cache load*/
5325         cache_prefetch ((__m128i*)mask);
5326         cache_prefetch ((__m128i*)dst);
5327
5328         while (w && ((unsigned long)dst & 15))
5329         {
5330             m = (uint32_t) *mask++;
5331             d = (uint32_t) *dst;
5332
5333             *dst++ = (uint8_t) pack_1x64_32 (
5334                 _mm_adds_pu16 (
5335                     pix_multiply_1x64 (
5336                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5337                     unpack_32_1x64 (d)));
5338             w--;
5339         }
5340
5341         /* call prefetch hint to optimize cache load*/
5342         cache_prefetch ((__m128i*)mask);
5343         cache_prefetch ((__m128i*)dst);
5344
5345         while (w >= 16)
5346         {
5347             /* fill cache line with next memory */
5348             cache_prefetch_next ((__m128i*)mask);
5349             cache_prefetch_next ((__m128i*)dst);
5350
5351             xmm_mask = load_128_unaligned ((__m128i*)mask);
5352             xmm_dst = load_128_aligned ((__m128i*)dst);
5353
5354             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5355             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5356
5357             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5358                                 &xmm_mask_lo, &xmm_mask_hi,
5359                                 &xmm_mask_lo, &xmm_mask_hi);
5360
5361             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5362             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5363
5364             save_128_aligned (
5365                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5366
5367             mask += 16;
5368             dst += 16;
5369             w -= 16;
5370         }
5371
5372         while (w)
5373         {
5374             m = (uint32_t) *mask++;
5375             d = (uint32_t) *dst;
5376
5377             *dst++ = (uint8_t) pack_1x64_32 (
5378                 _mm_adds_pu16 (
5379                     pix_multiply_1x64 (
5380                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5381                     unpack_32_1x64 (d)));
5382
5383             w--;
5384         }
5385     }
5386
5387     _mm_empty ();
5388 }
5389
5390 /* -------------------------------------------------------------------------
5391  * composite_add_n_8
5392  */
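     /*
      * 'add' with a solid source and no mask: dst = MIN (255, dst + srca).
      * srca == 0x00 is a no-op and srca == 0xff saturates the whole
      * rectangle to 0xff (done with pixman_fill ()); otherwise srca is
      * replicated into all 16 bytes of xmm_src and added with saturating
      * byte adds.
      */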
5393
5394 static void
5395 sse2_composite_add_n_8 (pixman_implementation_t *imp,
5396                         pixman_op_t              op,
5397                         pixman_image_t *         src_image,
5398                         pixman_image_t *         mask_image,
5399                         pixman_image_t *         dst_image,
5400                         int32_t                  src_x,
5401                         int32_t                  src_y,
5402                         int32_t                  mask_x,
5403                         int32_t                  mask_y,
5404                         int32_t                  dest_x,
5405                         int32_t                  dest_y,
5406                         int32_t                  width,
5407                         int32_t                  height)
5408 {
5409     uint8_t     *dst_line, *dst;
5410     int dst_stride;
5411     int32_t w;
5412     uint32_t src;
5413
5414     __m128i xmm_src;
5415
5416     PIXMAN_IMAGE_GET_LINE (
5417         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5418
5419     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5420
5421     src >>= 24;
5422
5423     if (src == 0x00)
5424         return;
5425
5426     if (src == 0xff)
5427     {
5428         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
5429                      8, dest_x, dest_y, width, height, 0xff);
5430
5431         return;
5432     }
5433
5434     src = (src << 24) | (src << 16) | (src << 8) | src;
5435     xmm_src = _mm_set_epi32 (src, src, src, src);
5436
5437     while (height--)
5438     {
5439         dst = dst_line;
5440         dst_line += dst_stride;
5441         w = width;
5442
5443         /* call prefetch hint to optimize cache load*/
5444         cache_prefetch ((__m128i*)dst);
5445
5446         while (w && ((unsigned long)dst & 15))
5447         {
5448             *dst = (uint8_t)_mm_cvtsi64_si32 (
5449                 _mm_adds_pu8 (
5450                     _mm_movepi64_pi64 (xmm_src),
5451                     _mm_cvtsi32_si64 (*dst)));
5452
5453             w--;
5454             dst++;
5455         }
5456
5457         /* call prefetch hint to optimize cache load*/
5458         cache_prefetch ((__m128i*)dst);
5459
5460         while (w >= 16)
5461         {
5462             /* fill cache line with next memory */
5463             cache_prefetch_next ((__m128i*)dst);
5464
5465             save_128_aligned (
5466                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
5467
5468             dst += 16;
5469             w -= 16;
5470         }
5471
5472         while (w)
5473         {
5474             *dst = (uint8_t)_mm_cvtsi64_si32 (
5475                 _mm_adds_pu8 (
5476                     _mm_movepi64_pi64 (xmm_src),
5477                     _mm_cvtsi32_si64 (*dst)));
5478
5479             w--;
5480             dst++;
5481         }
5482     }
5483
5484     _mm_empty ();
5485 }
5486
5487 /* ----------------------------------------------------------------------
5488  * composite_add_8000_8000
5489  */
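     /*
      * Saturating add of two a8 images.  The scalar head and tail use the
      * branch-free clamp "t | (0 - (t >> 8))": if the byte sum overflows,
      * t >> 8 is 1, 0 - 1 sets every bit, and the stored byte becomes
      * 0xff.  The aligned middle of each scanline is handed to
      * core_combine_add_u_sse2 (), four a8 pixels per 32-bit unit.
      */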
5490
5491 static void
5492 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5493                               pixman_op_t              op,
5494                               pixman_image_t *         src_image,
5495                               pixman_image_t *         mask_image,
5496                               pixman_image_t *         dst_image,
5497                               int32_t                  src_x,
5498                               int32_t                  src_y,
5499                               int32_t                  mask_x,
5500                               int32_t                  mask_y,
5501                               int32_t                  dest_x,
5502                               int32_t                  dest_y,
5503                               int32_t                  width,
5504                               int32_t                  height)
5505 {
5506     uint8_t     *dst_line, *dst;
5507     uint8_t     *src_line, *src;
5508     int dst_stride, src_stride;
5509     int32_t w;
5510     uint16_t t;
5511
5512     PIXMAN_IMAGE_GET_LINE (
5513         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5514     PIXMAN_IMAGE_GET_LINE (
5515         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5516
5517     while (height--)
5518     {
5519         dst = dst_line;
5520         src = src_line;
5521
5522         /* call prefetch hint to optimize cache load*/
5523         cache_prefetch ((__m128i*)src);
5524         cache_prefetch ((__m128i*)dst);
5525
5526         dst_line += dst_stride;
5527         src_line += src_stride;
5528         w = width;
5529
5530         /* Small head */
5531         while (w && (unsigned long)dst & 3)
5532         {
5533             t = (*dst) + (*src++);
5534             *dst++ = t | (0 - (t >> 8));
5535             w--;
5536         }
5537
5538         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5539
5540         /* Small tail */
5541         dst += w & 0xfffc;
5542         src += w & 0xfffc;
5543
5544         w &= 3;
5545
5546         while (w)
5547         {
5548             t = (*dst) + (*src++);
5549             *dst++ = t | (0 - (t >> 8));
5550             w--;
5551         }
5552     }
5553
5554     _mm_empty ();
5555 }
5556
5557 /* ---------------------------------------------------------------------
5558  * composite_add_8888_8888
5559  */
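     /*
      * Saturating per-channel add of two a8r8g8b8 images; each scanline
      * is delegated to core_combine_add_u_sse2 ().
      */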
5560 static void
5561 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5562                               pixman_op_t              op,
5563                               pixman_image_t *         src_image,
5564                               pixman_image_t *         mask_image,
5565                               pixman_image_t *         dst_image,
5566                               int32_t                  src_x,
5567                               int32_t                  src_y,
5568                               int32_t                  mask_x,
5569                               int32_t                  mask_y,
5570                               int32_t                  dest_x,
5571                               int32_t                  dest_y,
5572                               int32_t                  width,
5573                               int32_t                  height)
5574 {
5575     uint32_t    *dst_line, *dst;
5576     uint32_t    *src_line, *src;
5577     int dst_stride, src_stride;
5578
5579     PIXMAN_IMAGE_GET_LINE (
5580         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5581     PIXMAN_IMAGE_GET_LINE (
5582         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5583
5584     while (height--)
5585     {
5586         dst = dst_line;
5587         dst_line += dst_stride;
5588         src = src_line;
5589         src_line += src_stride;
5590
5591         core_combine_add_u_sse2 (dst, src, NULL, width);
5592     }
5593
5594     _mm_empty ();
5595 }
5596
5597 /* -----------------------------------------------------------------------
5598  * sse2_composite_copy_area
5599  */
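     /*
      * pixman_blt_sse2 () is a plain rectangle copy for 16 bpp and 32 bpp
      * images (it returns FALSE for anything else so callers can fall
      * back).  Strides are converted from uint32_t units to bytes, the
      * destination is aligned to 16 bytes with small scalar copies, the
      * bulk moves 64 bytes per iteration through four SSE2 registers, and
      * shorter loops pick up the remainder.
      */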
5600
5601 static pixman_bool_t
5602 pixman_blt_sse2 (uint32_t *src_bits,
5603                  uint32_t *dst_bits,
5604                  int       src_stride,
5605                  int       dst_stride,
5606                  int       src_bpp,
5607                  int       dst_bpp,
5608                  int       src_x,
5609                  int       src_y,
5610                  int       dst_x,
5611                  int       dst_y,
5612                  int       width,
5613                  int       height)
5614 {
5615     uint8_t *   src_bytes;
5616     uint8_t *   dst_bytes;
5617     int byte_width;
5618
5619     if (src_bpp != dst_bpp)
5620         return FALSE;
5621
5622     if (src_bpp == 16)
5623     {
5624         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5625         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5626         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5627         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5628         byte_width = 2 * width;
5629         src_stride *= 2;
5630         dst_stride *= 2;
5631     }
5632     else if (src_bpp == 32)
5633     {
5634         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5635         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5636         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5637         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5638         byte_width = 4 * width;
5639         src_stride *= 4;
5640         dst_stride *= 4;
5641     }
5642     else
5643     {
5644         return FALSE;
5645     }
5646
5647     cache_prefetch ((__m128i*)src_bytes);
5648     cache_prefetch ((__m128i*)dst_bytes);
5649
5650     while (height--)
5651     {
5652         int w;
5653         uint8_t *s = src_bytes;
5654         uint8_t *d = dst_bytes;
5655         src_bytes += src_stride;
5656         dst_bytes += dst_stride;
5657         w = byte_width;
5658
5659         cache_prefetch_next ((__m128i*)s);
5660         cache_prefetch_next ((__m128i*)d);
5661
5662         while (w >= 2 && ((unsigned long)d & 3))
5663         {
5664             *(uint16_t *)d = *(uint16_t *)s;
5665             w -= 2;
5666             s += 2;
5667             d += 2;
5668         }
5669
5670         while (w >= 4 && ((unsigned long)d & 15))
5671         {
5672             *(uint32_t *)d = *(uint32_t *)s;
5673
5674             w -= 4;
5675             s += 4;
5676             d += 4;
5677         }
5678
5679         cache_prefetch_next ((__m128i*)s);
5680         cache_prefetch_next ((__m128i*)d);
5681
5682         while (w >= 64)
5683         {
5684             __m128i xmm0, xmm1, xmm2, xmm3;
5685
5686             /* 128 bytes ahead */
5687             cache_prefetch (((__m128i*)s) + 8);
5688             cache_prefetch (((__m128i*)d) + 8);
5689
5690             xmm0 = load_128_unaligned ((__m128i*)(s));
5691             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5692             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5693             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5694
5695             save_128_aligned ((__m128i*)(d),    xmm0);
5696             save_128_aligned ((__m128i*)(d + 16), xmm1);
5697             save_128_aligned ((__m128i*)(d + 32), xmm2);
5698             save_128_aligned ((__m128i*)(d + 48), xmm3);
5699
5700             s += 64;
5701             d += 64;
5702             w -= 64;
5703         }
5704
5705         cache_prefetch_next ((__m128i*)s);
5706         cache_prefetch_next ((__m128i*)d);
5707
5708         while (w >= 16)
5709         {
5710             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5711
5712             w -= 16;
5713             d += 16;
5714             s += 16;
5715         }
5716
5717         cache_prefetch_next ((__m128i*)s);
5718         cache_prefetch_next ((__m128i*)d);
5719
5720         while (w >= 4)
5721         {
5722             *(uint32_t *)d = *(uint32_t *)s;
5723
5724             w -= 4;
5725             s += 4;
5726             d += 4;
5727         }
5728
5729         if (w >= 2)
5730         {
5731             *(uint16_t *)d = *(uint16_t *)s;
5732             w -= 2;
5733             s += 2;
5734             d += 2;
5735         }
5736     }
5737
5738     _mm_empty ();
5739
5740     return TRUE;
5741 }
5742
5743 static void
5744 sse2_composite_copy_area (pixman_implementation_t *imp,
5745                           pixman_op_t              op,
5746                           pixman_image_t *         src_image,
5747                           pixman_image_t *         mask_image,
5748                           pixman_image_t *         dst_image,
5749                           int32_t                  src_x,
5750                           int32_t                  src_y,
5751                           int32_t                  mask_x,
5752                           int32_t                  mask_y,
5753                           int32_t                  dest_x,
5754                           int32_t                  dest_y,
5755                           int32_t                  width,
5756                           int32_t                  height)
5757 {
5758     pixman_blt_sse2 (src_image->bits.bits,
5759                      dst_image->bits.bits,
5760                      src_image->bits.rowstride,
5761                      dst_image->bits.rowstride,
5762                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5763                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5764                      src_x, src_y, dest_x, dest_y, width, height);
5765 }
5766
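     /*
      * 'over' of an x8r8g8b8 source through an a8 mask onto an 8888
      * destination.  The source's undefined alpha byte is forced to 0xff,
      * so roughly each channel becomes (src * m + (255 - m) * dst) / 255;
      * a group of four mask bytes equal to 0xff lets the source block be
      * stored directly.
      */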
5767 static void
5768 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5769                                  pixman_op_t              op,
5770                                  pixman_image_t *         src_image,
5771                                  pixman_image_t *         mask_image,
5772                                  pixman_image_t *         dst_image,
5773                                  int32_t                  src_x,
5774                                  int32_t                  src_y,
5775                                  int32_t                  mask_x,
5776                                  int32_t                  mask_y,
5777                                  int32_t                  dest_x,
5778                                  int32_t                  dest_y,
5779                                  int32_t                  width,
5780                                  int32_t                  height)
5781 {
5782     uint32_t    *src, *src_line, s;
5783     uint32_t    *dst, *dst_line, d;
5784     uint8_t         *mask, *mask_line;
5785     uint32_t m;
5786     int src_stride, mask_stride, dst_stride;
5787     int32_t w;
5788     __m64 ms;
5789
5790     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5791     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5792     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5793
5794     PIXMAN_IMAGE_GET_LINE (
5795         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5796     PIXMAN_IMAGE_GET_LINE (
5797         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5798     PIXMAN_IMAGE_GET_LINE (
5799         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5800
5801     while (height--)
5802     {
5803         src = src_line;
5804         src_line += src_stride;
5805         dst = dst_line;
5806         dst_line += dst_stride;
5807         mask = mask_line;
5808         mask_line += mask_stride;
5809
5810         w = width;
5811
5812         /* call prefetch hint to optimize cache load*/
5813         cache_prefetch ((__m128i*)src);
5814         cache_prefetch ((__m128i*)dst);
5815         cache_prefetch ((__m128i*)mask);
5816
5817         while (w && (unsigned long)dst & 15)
5818         {
5819             s = 0xff000000 | *src++;
5820             m = (uint32_t) *mask++;
5821             d = *dst;
5822             ms = unpack_32_1x64 (s);
5823
5824             if (m != 0xff)
5825             {
5826                 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5827                 __m64 md = unpack_32_1x64 (d);
5828
5829                 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5830             }
5831
5832             *dst++ = pack_1x64_32 (ms);
5833             w--;
5834         }
5835
5836         /* call prefetch hint to optimize cache load*/
5837         cache_prefetch ((__m128i*)src);
5838         cache_prefetch ((__m128i*)dst);
5839         cache_prefetch ((__m128i*)mask);
5840
5841         while (w >= 4)
5842         {
5843             /* fill cache line with next memory */
5844             cache_prefetch_next ((__m128i*)src);
5845             cache_prefetch_next ((__m128i*)dst);
5846             cache_prefetch_next ((__m128i*)mask);
5847
5848             m = *(uint32_t*) mask;
5849             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5850
5851             if (m == 0xffffffff)
5852             {
5853                 save_128_aligned ((__m128i*)dst, xmm_src);
5854             }
5855             else
5856             {
5857                 xmm_dst = load_128_aligned ((__m128i*)dst);
5858
5859                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5860
5861                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5862                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5863                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5864
5865                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5866
5867                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5868
5869                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5870             }
5871
5872             src += 4;
5873             dst += 4;
5874             mask += 4;
5875             w -= 4;
5876         }
5877
5878         while (w)
5879         {
5880             m = (uint32_t) *mask++;
5881
5882             if (m)
5883             {
5884                 s = 0xff000000 | *src;
5885
5886                 if (m == 0xff)
5887                 {
5888                     *dst = s;
5889                 }
5890                 else
5891                 {
5892                     __m64 ma, md, ms;
5893
5894                     d = *dst;
5895
5896                     ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5897                     md = unpack_32_1x64 (d);
5898                     ms = unpack_32_1x64 (s);
5899
5900                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5901                 }
5902
5903             }
5904
5905             src++;
5906             dst++;
5907             w--;
5908         }
5909     }
5910
5911     _mm_empty ();
5912 }
5913
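     /*
      * 'over' of an a8r8g8b8 source through an a8 mask:
      * dst = (src IN m) OVER dst.  Four-pixel blocks with a zero mask are
      * skipped, and blocks where the mask is all 0xff and the source is
      * fully opaque are stored directly.
      */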
5914 static void
5915 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5916                                  pixman_op_t              op,
5917                                  pixman_image_t *         src_image,
5918                                  pixman_image_t *         mask_image,
5919                                  pixman_image_t *         dst_image,
5920                                  int32_t                  src_x,
5921                                  int32_t                  src_y,
5922                                  int32_t                  mask_x,
5923                                  int32_t                  mask_y,
5924                                  int32_t                  dest_x,
5925                                  int32_t                  dest_y,
5926                                  int32_t                  width,
5927                                  int32_t                  height)
5928 {
5929     uint32_t    *src, *src_line, s;
5930     uint32_t    *dst, *dst_line, d;
5931     uint8_t         *mask, *mask_line;
5932     uint32_t m;
5933     int src_stride, mask_stride, dst_stride;
5934     int32_t w;
5935
5936     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5937     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5938     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5939
5940     PIXMAN_IMAGE_GET_LINE (
5941         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5942     PIXMAN_IMAGE_GET_LINE (
5943         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5944     PIXMAN_IMAGE_GET_LINE (
5945         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5946
5947     while (height--)
5948     {
5949         src = src_line;
5950         src_line += src_stride;
5951         dst = dst_line;
5952         dst_line += dst_stride;
5953         mask = mask_line;
5954         mask_line += mask_stride;
5955
5956         w = width;
5957
5958         /* call prefetch hint to optimize cache load*/
5959         cache_prefetch ((__m128i *)src);
5960         cache_prefetch ((__m128i *)dst);
5961         cache_prefetch ((__m128i *)mask);
5962
5963         while (w && (unsigned long)dst & 15)
5964         {
5965             uint32_t sa;
5966
5967             s = *src++;
5968             m = (uint32_t) *mask++;
5969             d = *dst;
5970
5971             sa = s >> 24;
5972
5973             if (m)
5974             {
5975                 if (sa == 0xff && m == 0xff)
5976                 {
5977                     *dst = s;
5978                 }
5979                 else
5980                 {
5981                     __m64 ms, md, ma, msa;
5982
5983                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5984                     ms = unpack_32_1x64 (s);
5985                     md = unpack_32_1x64 (d);
5986
5987                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5988
5989                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5990                 }
5991             }
5992
5993             dst++;
5994             w--;
5995         }
5996
5997         /* call prefetch hint to optimize cache load*/
5998         cache_prefetch ((__m128i *)src);
5999         cache_prefetch ((__m128i *)dst);
6000         cache_prefetch ((__m128i *)mask);
6001
6002         while (w >= 4)
6003         {
6004             /* fill cache line with next memory */
6005             cache_prefetch_next ((__m128i *)src);
6006             cache_prefetch_next ((__m128i *)dst);
6007             cache_prefetch_next ((__m128i *)mask);
6008
6009             m = *(uint32_t *) mask;
6010
6011             if (m)
6012             {
6013                 xmm_src = load_128_unaligned ((__m128i*)src);
6014
6015                 if (m == 0xffffffff && is_opaque (xmm_src))
6016                 {
6017                     save_128_aligned ((__m128i *)dst, xmm_src);
6018                 }
6019                 else
6020                 {
6021                     xmm_dst = load_128_aligned ((__m128i *)dst);
6022
6023                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
6024
6025                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6026                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
6027                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6028
6029                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
6030                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
6031
6032                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
6033                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
6034
6035                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6036                 }
6037             }
6038
6039             src += 4;
6040             dst += 4;
6041             mask += 4;
6042             w -= 4;
6043         }
6044
6045         while (w)
6046         {
6047             uint32_t sa;
6048
6049             s = *src++;
6050             m = (uint32_t) *mask++;
6051             d = *dst;
6052
6053             sa = s >> 24;
6054
6055             if (m)
6056             {
6057                 if (sa == 0xff && m == 0xff)
6058                 {
6059                     *dst = s;
6060                 }
6061                 else
6062                 {
6063                     __m64 ms, md, ma, msa;
6064
6065                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
6066                     ms = unpack_32_1x64 (s);
6067                     md = unpack_32_1x64 (d);
6068
6069                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
6070
6071                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
6072                 }
6073             }
6074
6075             dst++;
6076             w--;
6077         }
6078     }
6079
6080     _mm_empty ();
6081 }
6082
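     /*
      * 'over reverse' with a solid source: the existing destination is
      * composited over the solid color, dst = dst + (1 - dst.a) * src, so
      * fully opaque destination pixels are left unchanged.
      */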
6083 static void
6084 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
6085                                     pixman_op_t              op,
6086                                     pixman_image_t *         src_image,
6087                                     pixman_image_t *         mask_image,
6088                                     pixman_image_t *         dst_image,
6089                                     int32_t                  src_x,
6090                                     int32_t                  src_y,
6091                                     int32_t                  mask_x,
6092                                     int32_t                  mask_y,
6093                                     int32_t                  dest_x,
6094                                     int32_t                  dest_y,
6095                                     int32_t                  width,
6096                                     int32_t                  height)
6097 {
6098     uint32_t src;
6099     uint32_t    *dst_line, *dst;
6100     __m128i xmm_src;
6101     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6102     __m128i xmm_dsta_hi, xmm_dsta_lo;
6103     int dst_stride;
6104     int32_t w;
6105
6106     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
6107
6108     if (src == 0)
6109         return;
6110
6111     PIXMAN_IMAGE_GET_LINE (
6112         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
6113
6114     xmm_src = expand_pixel_32_1x128 (src);
6115
6116     while (height--)
6117     {
6118         dst = dst_line;
6119
6120         /* call prefetch hint to optimize cache load*/
6121         cache_prefetch ((__m128i*)dst);
6122
6123         dst_line += dst_stride;
6124         w = width;
6125
6126         while (w && (unsigned long)dst & 15)
6127         {
6128             __m64 vd;
6129
6130             vd = unpack_32_1x64 (*dst);
6131
6132             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
6133                                             _mm_movepi64_pi64 (xmm_src)));
6134             w--;
6135             dst++;
6136         }
6137
6138         cache_prefetch ((__m128i*)dst);
6139
6140         while (w >= 4)
6141         {
6142             __m128i tmp_lo, tmp_hi;
6143
6144             /* fill cache line with next memory */
6145             cache_prefetch_next ((__m128i*)(dst + 4));
6146
6147             xmm_dst = load_128_aligned ((__m128i*)dst);
6148
6149             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6150             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
6151
6152             tmp_lo = xmm_src;
6153             tmp_hi = xmm_src;
6154
6155             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
6156                         &xmm_dsta_lo, &xmm_dsta_hi,
6157                         &tmp_lo, &tmp_hi);
6158
6159             save_128_aligned (
6160                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
6161
6162             w -= 4;
6163             dst += 4;
6164         }
6165
6166         while (w)
6167         {
6168             __m64 vd;
6169
6170             vd = unpack_32_1x64 (*dst);
6171
6172             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
6173                                             _mm_movepi64_pi64 (xmm_src)));
6174             w--;
6175             dst++;
6176         }
6177
6178     }
6179
6180     _mm_empty ();
6181 }
6182
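     /*
      * 'over' of an a8r8g8b8 source with an a8r8g8b8 mask, of which only
      * the alpha channel is used: dst = (src IN mask.a) OVER dst.  Blocks
      * whose mask alpha is all zero are skipped; blocks where both the
      * mask and the source are fully opaque are stored as-is.
      */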
6183 static void
6184 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
6185                                     pixman_op_t              op,
6186                                     pixman_image_t *         src_image,
6187                                     pixman_image_t *         mask_image,
6188                                     pixman_image_t *         dst_image,
6189                                     int32_t                  src_x,
6190                                     int32_t                  src_y,
6191                                     int32_t                  mask_x,
6192                                     int32_t                  mask_y,
6193                                     int32_t                  dest_x,
6194                                     int32_t                  dest_y,
6195                                     int32_t                  width,
6196                                     int32_t                  height)
6197 {
6198     uint32_t    *src, *src_line, s;
6199     uint32_t    *dst, *dst_line, d;
6200     uint32_t    *mask, *mask_line;
6201     uint32_t    m;
6202     int src_stride, mask_stride, dst_stride;
6203     int32_t w;
6204
6205     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
6206     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6207     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
6208
6209     PIXMAN_IMAGE_GET_LINE (
6210         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
6211     PIXMAN_IMAGE_GET_LINE (
6212         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
6213     PIXMAN_IMAGE_GET_LINE (
6214         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
6215
6216     while (height--)
6217     {
6218         src = src_line;
6219         src_line += src_stride;
6220         dst = dst_line;
6221         dst_line += dst_stride;
6222         mask = mask_line;
6223         mask_line += mask_stride;
6224
6225         w = width;
6226
6227         /* call prefetch hint to optimize cache load*/
6228         cache_prefetch ((__m128i *)src);
6229         cache_prefetch ((__m128i *)dst);
6230         cache_prefetch ((__m128i *)mask);
6231
6232         while (w && (unsigned long)dst & 15)
6233         {
6234             uint32_t sa;
6235
6236             s = *src++;
6237             m = (*mask++) >> 24;
6238             d = *dst;
6239
6240             sa = s >> 24;
6241
6242             if (m)
6243             {
6244                 if (sa == 0xff && m == 0xff)
6245                 {
6246                     *dst = s;
6247                 }
6248                 else
6249                 {
6250                     __m64 ms, md, ma, msa;
6251
6252                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
6253                     ms = unpack_32_1x64 (s);
6254                     md = unpack_32_1x64 (d);
6255
6256                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
6257
6258                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
6259                 }
6260             }
6261
6262             dst++;
6263             w--;
6264         }
6265
6266         /* call prefetch hint to optimize cache load*/
6267         cache_prefetch ((__m128i *)src);
6268         cache_prefetch ((__m128i *)dst);
6269         cache_prefetch ((__m128i *)mask);
6270
6271         while (w >= 4)
6272         {
6273             /* fill cache line with next memory */
6274             cache_prefetch_next ((__m128i *)src);
6275             cache_prefetch_next ((__m128i *)dst);
6276             cache_prefetch_next ((__m128i *)mask);
6277
6278             xmm_mask = load_128_unaligned ((__m128i*)mask);
6279
6280             if (!is_transparent (xmm_mask))
6281             {
6282                 xmm_src = load_128_unaligned ((__m128i*)src);
6283
6284                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
6285                 {
6286                     save_128_aligned ((__m128i *)dst, xmm_src);
6287                 }
6288                 else
6289                 {
6290                     xmm_dst = load_128_aligned ((__m128i *)dst);
6291
6292                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6293                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
6294                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6295
6296                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
6297                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
6298
6299                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
6300                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
6301
6302                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6303                 }
6304             }
6305
6306             src += 4;
6307             dst += 4;
6308             mask += 4;
6309             w -= 4;
6310         }
6311
6312         while (w)
6313         {
6314             uint32_t sa;
6315
6316             s = *src++;
6317             m = (*mask++) >> 24;
6318             d = *dst;
6319
6320             sa = s >> 24;
6321
6322             if (m)
6323             {
6324                 if (sa == 0xff && m == 0xff)
6325                 {
6326                     *dst = s;
6327                 }
6328                 else
6329                 {
6330                     __m64 ms, md, ma, msa;
6331
6332                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
6333                     ms = unpack_32_1x64 (s);
6334                     md = unpack_32_1x64 (d);
6335
6336                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
6337
6338                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
6339                 }
6340             }
6341
6342             dst++;
6343             w--;
6344         }
6345     }
6346
6347     _mm_empty ();
6348 }
6349
6350 /* A variant of 'core_combine_over_u_sse2' with minor tweaks */
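     /*
      * The source x coordinate runs in 16.16 fixed point: each destination
      * pixel reads the nearest source pixel as ps[vx >> 16] and then
      * advances vx by unit_x.  pm stays NULL, so combine1 ()/combine4 ()
      * degenerate to plain loads; max_vx is part of the common prototype
      * but is not used here.
      */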
6351 static force_inline void
6352 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
6353                                              const uint32_t* ps,
6354                                              int32_t         w,
6355                                              pixman_fixed_t  vx,
6356                                              pixman_fixed_t  unit_x,
6357                                              pixman_fixed_t  max_vx)
6358 {
6359     uint32_t s, d;
6360     const uint32_t* pm = NULL;
6361
6362     __m128i xmm_dst_lo, xmm_dst_hi;
6363     __m128i xmm_src_lo, xmm_src_hi;
6364     __m128i xmm_alpha_lo, xmm_alpha_hi;
6365
6366     /* Align dst on a 16-byte boundary */
6367     while (w && ((unsigned long)pd & 15))
6368     {
6369         d = *pd;
6370         s = combine1 (ps + (vx >> 16), pm);
6371         vx += unit_x;
6372
6373         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
6374         if (pm)
6375             pm++;
6376         w--;
6377     }
6378
6379     while (w >= 4)
6380     {
6381         __m128i tmp;
6382         uint32_t tmp1, tmp2, tmp3, tmp4;
6383
6384         tmp1 = ps[vx >> 16];
6385         vx += unit_x;
6386         tmp2 = ps[vx >> 16];
6387         vx += unit_x;
6388         tmp3 = ps[vx >> 16];
6389         vx += unit_x;
6390         tmp4 = ps[vx >> 16];
6391         vx += unit_x;
6392
6393         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
6394
6395         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
6396
6397         if (is_opaque (xmm_src_hi))
6398         {
6399             save_128_aligned ((__m128i*)pd, xmm_src_hi);
6400         }
6401         else if (!is_zero (xmm_src_hi))
6402         {
6403             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
6404
6405             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
6406             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
6407
6408             expand_alpha_2x128 (
6409                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
6410
6411             over_2x128 (&xmm_src_lo, &xmm_src_hi,
6412                         &xmm_alpha_lo, &xmm_alpha_hi,
6413                         &xmm_dst_lo, &xmm_dst_hi);
6414
6415             /* rebuild the 4 pixel data and save */
6416             save_128_aligned ((__m128i*)pd,
6417                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6418         }
6419
6420         w -= 4;
6421         pd += 4;
6422         if (pm)
6423             pm += 4;
6424     }
6425
6426     while (w)
6427     {
6428         d = *pd;
6429         s = combine1 (ps + (vx >> 16), pm);
6430         vx += unit_x;
6431
6432         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
6433         if (pm)
6434             pm++;
6435
6436         w--;
6437     }
6438     _mm_empty ();
6439 }
6440
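     /*
      * FAST_NEAREST_MAINLOOP (from pixman-fast-path.h) wraps the scanline
      * routine above in a complete composite function for one repeat mode
      * (COVER, NONE or PAD); the SIMPLE_NEAREST_FAST_PATH_* entries in the
      * table below refer to the functions generated here.
      */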
6441 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
6442                        scaled_nearest_scanline_sse2_8888_8888_OVER,
6443                        uint32_t, uint32_t, COVER);
6444 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
6445                        scaled_nearest_scanline_sse2_8888_8888_OVER,
6446                        uint32_t, uint32_t, NONE);
6447 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
6448                        scaled_nearest_scanline_sse2_8888_8888_OVER,
6449                        uint32_t, uint32_t, PAD);
6450
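     /*
      * Each row names an operator plus source, mask and destination
      * formats and the routine that handles that combination; the first
      * matching row is used, and anything without a match falls through
      * to the (slower) delegate implementation.
      */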
6451 static const pixman_fast_path_t sse2_fast_paths[] =
6452 {
6453     /* PIXMAN_OP_OVER */
6454     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6455     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6456     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6457     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6458     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6459     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6460     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6461     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6462     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6463     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6464     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6465     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6466     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6467     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6468     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6469     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6470     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6471     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6472     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6473     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6474     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6475     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6476     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6477     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6478     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6479     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6480     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6481     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6482     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6483     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6484     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6485     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6486     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6487     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6488     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6489     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6490     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6491     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6492     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6493     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6494     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6495     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6496     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6497     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6498     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6499     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6500
6501     /* PIXMAN_OP_OVER_REVERSE */
6502     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6503     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6504
6505     /* PIXMAN_OP_ADD */
6506     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6507     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
6508     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6509     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6510     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6511     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6512
6513     /* PIXMAN_OP_SRC */
6514     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6515     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6516     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6517     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6518     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6519     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6520     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6521     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6522     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6523     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6524     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6525     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6526     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6527     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6528
6529     /* PIXMAN_OP_IN */
6530     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6531     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6532     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6533
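    /* Scaled OVER with nearest filter (COVER, NONE and PAD repeat variants) */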
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    { PIXMAN_OP_NONE },
};

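/* Rectangle copy: use the SSE2 blitter when pixman_blt_sse2 () can handle
 * the request, otherwise fall back to the delegate implementation.
 */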
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,
          int                      src_stride,
          int                      dst_stride,
          int                      src_bpp,
          int                      dst_bpp,
          int                      src_x,
          int                      src_y,
          int                      dst_x,
          int                      dst_y,
          int                      width,
          int                      height)
{
    if (!pixman_blt_sse2 (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height))
    {
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height);
    }

    return TRUE;
}

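/* On 32-bit x86, GCC does not guarantee a 16-byte aligned stack, which the
 * SSE2 code requires, so force stack realignment for the entry points that
 * non-SSE2 parts of pixman call directly.
 */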
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
           int                      stride,
           int                      bpp,
           int                      x,
           int                      y,
           int                      width,
           int                      height,
           uint32_t xor)
{
    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}

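/* Build the SSE2 implementation: pick the fallback delegate (MMX when built
 * with USE_MMX, otherwise the generic fast-path implementation), initialize
 * the constant masks used by the code above, and install the combiner, blt
 * and fill entry points.
 *
 * A caller would normally obtain it during implementation selection,
 * roughly like this (sketch, assuming the runtime CPU detection in
 * pixman-cpu.c):
 *
 *     if (pixman_have_sse2 ())
 *         imp = _pixman_implementation_create_sse2 ();
 */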
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (void)
{
#ifdef USE_MMX
    pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
#else
    pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
#endif
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

    /* SSE2 constants */
    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);

    /* MMX constants */
    mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
    mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);

    mask_x0080 = create_mask_16_64 (0x0080);
    mask_x00ff = create_mask_16_64 (0x00ff);
    mask_x0101 = create_mask_16_64 (0x0101);
    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);

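    /* The 64-bit mask constants above are built with MMX intrinsics, so
     * issue EMMS here to clear the MMX state before any later x87 code runs.
     */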
    _mm_empty ();

    /* Set up function pointers */

    /* SSE code path for fbcompose.c */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    return imp;
}

#endif /* USE_SSE2 */