1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <mmintrin.h>
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41  * the pixman-x64-mmx-emulation.h file contains
42  * implementations of those MMX intrinsics that
43  * are used in the SSE2 implementation.
44  */
45 #   include "pixman-x64-mmx-emulation.h"
46 #endif
47
48 #ifdef USE_SSE2
49
50 /* --------------------------------------------------------------------
51  * Locals
52  */
53
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
58
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
61
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
68
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
75
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
78
79 /* ----------------------------------------------------------------------
80  * SSE2 Inlines
81  */
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
84 {
85     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
86 }
87
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
90 {
91     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
93 }
94
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
97 {
98     __m128i r, g, b, rb, t;
99
100     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
103
104     rb = _mm_or_si128 (r, b);
105     t  = _mm_and_si128 (rb, mask_565_fix_rb);
106     t  = _mm_srli_epi32 (t, 5);
107     rb = _mm_or_si128 (rb, t);
108
109     t  = _mm_and_si128 (g, mask_565_fix_g);
110     t  = _mm_srli_epi32 (t, 6);
111     g  = _mm_or_si128 (g, t);
112
113     return _mm_or_si128 (rb, g);
114 }
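/* Illustrative note (not part of the original source): the mask_565_fix_rb
 * and mask_565_fix_g steps above replicate the top bits of each channel
 * into its low bits so that 5- and 6-bit values span the full 8-bit range.
 * For a 5-bit red value r5 the net effect is
 *
 *     r8 = (r5 << 3) | (r5 >> 2);     e.g. 0x1f -> 0xf8 | 0x07 = 0xff
 *
 * and similarly (g6 << 2) | (g6 >> 4) for the 6-bit green channel.
 */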
115
116 static force_inline void
117 unpack_565_128_4x128 (__m128i  data,
118                       __m128i* data0,
119                       __m128i* data1,
120                       __m128i* data2,
121                       __m128i* data3)
122 {
123     __m128i lo, hi;
124
125     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
127
128     lo = unpack_565_to_8888 (lo);
129     hi = unpack_565_to_8888 (hi);
130
131     unpack_128_2x128 (lo, data0, data1);
132     unpack_128_2x128 (hi, data2, data3);
133 }
134
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
137 {
138     return (uint16_t) (((pixel >> 8) & 0xf800) |
139                        ((pixel >> 5) & 0x07e0) |
140                        ((pixel >> 3) & 0x001f));
141 }
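/* A worked example of the scalar packing above (illustrative only):
 * for the x8r8g8b8 pixel 0x00ff8040,
 *
 *     ((0x00ff8040 >> 8) & 0xf800) = 0xf800    (red   0xff -> 0x1f << 11)
 *     ((0x00ff8040 >> 5) & 0x07e0) = 0x0400    (green 0x80 -> 0x20 << 5)
 *     ((0x00ff8040 >> 3) & 0x001f) = 0x0008    (blue  0x40 -> 0x08)
 *
 * giving the r5g6b5 value 0xfc08.
 */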
142
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
145 {
146     return _mm_packus_epi16 (lo, hi);
147 }
148
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
151 {
152     __m128i data;
153     __m128i r, g1, g2, b;
154
155     data = pack_2x128_128 (lo, hi);
156
157     r  = _mm_and_si128 (data, mask_565_r);
158     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
162     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163 }
164
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167 {
168     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169                              pack_565_2x128_128 (*xmm2, *xmm3));
170 }
171
172 static force_inline int
173 is_opaque (__m128i x)
174 {
175     __m128i ffs = _mm_cmpeq_epi8 (x, x);
176
177     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178 }
179
180 static force_inline int
181 is_zero (__m128i x)
182 {
183     return _mm_movemask_epi8 (
184         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185 }
186
187 static force_inline int
188 is_transparent (__m128i x)
189 {
190     return (_mm_movemask_epi8 (
191                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192 }
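/* Note (added for clarity, not in the original source): the pixels handled
 * by the three helpers above are a8r8g8b8, so every fourth byte of the
 * vector is an alpha byte.  _mm_movemask_epi8 collects one bit per byte,
 * which is why the alpha bytes map to the 0x8888 bit pattern: is_opaque()
 * checks that all four alphas are 0xff, is_zero() that the whole vector is
 * zero, and is_transparent() that all four alphas are zero.
 */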
193
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
196 {
197     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198 }
199
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
202 {
203     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204                                                      _MM_SHUFFLE (3, 3, 3, 3)),
205                                 _MM_SHUFFLE (3, 3, 3, 3));
206 }
207
208 static force_inline void
209 expand_alpha_2x128 (__m128i  data_lo,
210                     __m128i  data_hi,
211                     __m128i* alpha_lo,
212                     __m128i* alpha_hi)
213 {
214     __m128i lo, hi;
215
216     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
219     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221 }
222
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i  data_lo,
225                         __m128i  data_hi,
226                         __m128i* alpha_lo,
227                         __m128i* alpha_hi)
228 {
229     __m128i lo, hi;
230
231     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235 }
236
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
239                     __m128i* data_hi,
240                     __m128i* alpha_lo,
241                     __m128i* alpha_hi,
242                     __m128i* ret_lo,
243                     __m128i* ret_hi)
244 {
245     __m128i lo, hi;
246
247     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249     lo = _mm_adds_epu16 (lo, mask_0080);
250     hi = _mm_adds_epu16 (hi, mask_0080);
251     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253 }
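/* Illustrative sketch (not part of the original source) of the exact
 * divide-by-255 trick used in pix_multiply_2x128 above.  Per 16-bit lane,
 * for 8-bit values x and a, it computes
 *
 *     t = x * a + 0x80;
 *     result = (t + (t >> 8)) >> 8;    // same as _mm_mulhi_epu16 (t, 0x0101)
 *
 * i.e. (x * a) / 255 with correct rounding.  For example x = 0xff, a = 0x80:
 * t = 0x7f80 + 0x80 = 0x8000, and (0x8000 + 0x80) >> 8 = 0x80.
 */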
254
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
257                         __m128i* src_hi,
258                         __m128i* alpha_dst_lo,
259                         __m128i* alpha_dst_hi,
260                         __m128i* dst_lo,
261                         __m128i* dst_hi,
262                         __m128i* alpha_src_lo,
263                         __m128i* alpha_src_hi,
264                         __m128i* ret_lo,
265                         __m128i* ret_hi)
266 {
267     __m128i t1_lo, t1_hi;
268     __m128i t2_lo, t2_hi;
269
270     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272
273     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275 }
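/* Note (illustrative, not in the original source): pix_add_multiply_2x128
 * computes the two-product building block used by the ATOP and XOR
 * combiners below,
 *
 *     result = src * alpha_dst + dst * alpha_src
 *
 * with each product divided by 255 as in pix_multiply_2x128 and the sum
 * saturated per channel by _mm_adds_epu8.
 */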
276
277 static force_inline void
278 negate_2x128 (__m128i  data_lo,
279               __m128i  data_hi,
280               __m128i* neg_lo,
281               __m128i* neg_hi)
282 {
283     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285 }
286
287 static force_inline void
288 invert_colors_2x128 (__m128i  data_lo,
289                      __m128i  data_hi,
290                      __m128i* inv_lo,
291                      __m128i* inv_hi)
292 {
293     __m128i lo, hi;
294
295     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299 }
300
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
303             __m128i* src_hi,
304             __m128i* alpha_lo,
305             __m128i* alpha_hi,
306             __m128i* dst_lo,
307             __m128i* dst_hi)
308 {
309     __m128i t1, t2;
310
311     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
312
313     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
314
315     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
317 }
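/* Note (added for clarity): over_2x128 implements the premultiplied OVER
 * operator on two 2-pixel halves,
 *
 *     dst = src + (1 - alpha_src) * dst
 *
 * where negate_2x128 forms (255 - alpha) per channel, pix_multiply_2x128
 * scales dst by it, and _mm_adds_epu8 adds the source with saturation.
 */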
318
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i  src_lo,
321                         __m128i  src_hi,
322                         __m128i* dst_lo,
323                         __m128i* dst_hi)
324 {
325     __m128i lo, hi;
326     __m128i alpha_lo, alpha_hi;
327
328     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
329
330     lo = _mm_or_si128 (alpha_lo, mask_alpha);
331     hi = _mm_or_si128 (alpha_hi, mask_alpha);
332
333     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
334
335     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
336
337     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
338 }
339
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
342                __m128i* src_hi,
343                __m128i* alpha_lo,
344                __m128i* alpha_hi,
345                __m128i* mask_lo,
346                __m128i* mask_hi,
347                __m128i* dst_lo,
348                __m128i* dst_hi)
349 {
350     __m128i s_lo, s_hi;
351     __m128i a_lo, a_hi;
352
353     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355
356     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357 }
358
359 static force_inline void
360 cache_prefetch (__m128i* addr)
361 {
362     _mm_prefetch ((void const*)addr, _MM_HINT_T0);
363 }
364
365 static force_inline void
366 cache_prefetch_next (__m128i* addr)
367 {
368     _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
369 }
370
371 /* Prefetching NULL is very slow on some systems, so don't do that. */
372
373 static force_inline void
374 maybe_prefetch (__m128i* addr)
375 {
376     if (addr)
377         cache_prefetch (addr);
378 }
379
380 static force_inline void
381 maybe_prefetch_next (__m128i* addr)
382 {
383     if (addr)
384         cache_prefetch_next (addr);
385 }
386
387 /* load 4 pixels from a 16-byte boundary aligned address */
388 static force_inline __m128i
389 load_128_aligned (__m128i* src)
390 {
391     return _mm_load_si128 (src);
392 }
393
394 /* load 4 pixels from an unaligned address */
395 static force_inline __m128i
396 load_128_unaligned (const __m128i* src)
397 {
398     return _mm_loadu_si128 (src);
399 }
400
401 /* save 4 pixels using Write Combining memory on a 16-byte
402  * boundary aligned address
403  */
404 static force_inline void
405 save_128_write_combining (__m128i* dst,
406                           __m128i  data)
407 {
408     _mm_stream_si128 (dst, data);
409 }
410
411 /* save 4 pixels on a 16-byte boundary aligned address */
412 static force_inline void
413 save_128_aligned (__m128i* dst,
414                   __m128i  data)
415 {
416     _mm_store_si128 (dst, data);
417 }
418
419 /* save 4 pixels to an unaligned address */
420 static force_inline void
421 save_128_unaligned (__m128i* dst,
422                     __m128i  data)
423 {
424     _mm_storeu_si128 (dst, data);
425 }
426
427 /* ------------------------------------------------------------------
428  * MMX inlines
429  */
430
431 static force_inline __m64
432 load_32_1x64 (uint32_t data)
433 {
434     return _mm_cvtsi32_si64 (data);
435 }
436
437 static force_inline __m64
438 unpack_32_1x64 (uint32_t data)
439 {
440     return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
441 }
442
443 static force_inline __m64
444 expand_alpha_1x64 (__m64 data)
445 {
446     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
447 }
448
449 static force_inline __m64
450 expand_alpha_rev_1x64 (__m64 data)
451 {
452     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
453 }
454
455 static force_inline __m64
456 expand_pixel_8_1x64 (uint8_t data)
457 {
458     return _mm_shuffle_pi16 (
459         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
460 }
461
462 static force_inline __m64
463 pix_multiply_1x64 (__m64 data,
464                    __m64 alpha)
465 {
466     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
467                                           mask_x0080),
468                            mask_x0101);
469 }
470
471 static force_inline __m64
472 pix_add_multiply_1x64 (__m64* src,
473                        __m64* alpha_dst,
474                        __m64* dst,
475                        __m64* alpha_src)
476 {
477     __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
478     __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
479
480     return _mm_adds_pu8 (t1, t2);
481 }
482
483 static force_inline __m64
484 negate_1x64 (__m64 data)
485 {
486     return _mm_xor_si64 (data, mask_x00ff);
487 }
488
489 static force_inline __m64
490 invert_colors_1x64 (__m64 data)
491 {
492     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
493 }
494
495 static force_inline __m64
496 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
497 {
498     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
499 }
500
501 static force_inline __m64
502 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
503 {
504     return over_1x64 (pix_multiply_1x64 (*src, *mask),
505                       pix_multiply_1x64 (*alpha, *mask),
506                       *dst);
507 }
508
509 static force_inline __m64
510 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
511 {
512     __m64 alpha = expand_alpha_1x64 (src);
513
514     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
515                                          _mm_or_si64 (alpha, mask_x_alpha)),
516                       alpha,
517                       dst);
518 }
519
520 static force_inline uint32_t
521 pack_1x64_32 (__m64 data)
522 {
523     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
524 }
525
526 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into
527  *
528  *    00RR00GG00BB
529  *
530  * --- Expanding 565 in the low word ---
531  *
532  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
533  * m = m & (01f0003f001f);
534  * m = m * (008404100840);
535  * m = m >> 8;
536  *
537  * Note the trick here - the top word is shifted by another nibble to
538  * avoid it bumping into the middle word
539  */
540 static force_inline __m64
541 expand565_16_1x64 (uint16_t pixel)
542 {
543     __m64 p;
544     __m64 t1, t2;
545
546     p = _mm_cvtsi32_si64 ((uint32_t) pixel);
547
548     t1 = _mm_slli_si64 (p, 36 - 11);
549     t2 = _mm_slli_si64 (p, 16 - 5);
550
551     p = _mm_or_si64 (t1, p);
552     p = _mm_or_si64 (t2, p);
553     p = _mm_and_si64 (p, mask_x565_rgb);
554     p = _mm_mullo_pi16 (p, mask_x565_unpack);
555
556     return _mm_srli_pi16 (p, 8);
557 }
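/* A worked example of the multiply trick above (illustrative only, based on
 * the constants quoted in the comment): for the 565 pixel 0xffff
 * (r = 0x1f, g = 0x3f, b = 0x1f), after the shifts and the mask the red and
 * blue lanes end up multiplied by 0x0840 and the green lane by 0x0410, so
 * every 16-bit field holds 0xffc0 or 0xfff0 and the final >> 8 yields 0x00ff
 * per channel, i.e. the fully expanded 0x00ff00ff00ff.
 */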
558
559 /* ----------------------------------------------------------------------------
560  * Compose Core transformations
561  */
562 static force_inline uint32_t
563 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
564 {
565     uint8_t a;
566     __m64 ms;
567
568     a = src >> 24;
569
570     if (a == 0xff)
571     {
572         return src;
573     }
574     else if (src)
575     {
576         ms = unpack_32_1x64 (src);
577         return pack_1x64_32 (
578             over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
579     }
580
581     return dst;
582 }
583
584 static force_inline uint32_t
585 combine1 (const uint32_t *ps, const uint32_t *pm)
586 {
587     uint32_t s = *ps;
588
589     if (pm)
590     {
591         __m64 ms, mm;
592
593         mm = unpack_32_1x64 (*pm);
594         mm = expand_alpha_1x64 (mm);
595
596         ms = unpack_32_1x64 (s);
597         ms = pix_multiply_1x64 (ms, mm);
598
599         s = pack_1x64_32 (ms);
600     }
601
602     return s;
603 }
604
605 static force_inline __m128i
606 combine4 (const __m128i *ps, const __m128i *pm)
607 {
608     __m128i xmm_src_lo, xmm_src_hi;
609     __m128i xmm_msk_lo, xmm_msk_hi;
610     __m128i s;
611
612     if (pm)
613     {
614         xmm_msk_lo = load_128_unaligned (pm);
615
616         if (is_transparent (xmm_msk_lo))
617             return _mm_setzero_si128 ();
618     }
619
620     s = load_128_unaligned (ps);
621
622     if (pm)
623     {
624         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
625         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
626
627         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
628
629         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
630                             &xmm_msk_lo, &xmm_msk_hi,
631                             &xmm_src_lo, &xmm_src_hi);
632
633         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
634     }
635
636     return s;
637 }
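/* Note (added for clarity, not in the original source): combine1 () and
 * combine4 () implement the mask step shared by the unified combiners
 * below.  When a mask pointer is supplied, the source pixels are first
 * multiplied by the mask's expanded alpha; with a NULL mask the source is
 * passed through unchanged.  combine4 () also short-circuits to zero when
 * all four mask alphas are zero.
 */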
638
639 static force_inline void
640 core_combine_over_u_sse2 (uint32_t*       pd,
641                           const uint32_t* ps,
642                           const uint32_t* pm,
643                           int             w)
644 {
645     uint32_t s, d;
646
647     __m128i xmm_dst_lo, xmm_dst_hi;
648     __m128i xmm_src_lo, xmm_src_hi;
649     __m128i xmm_alpha_lo, xmm_alpha_hi;
650
651     /* call prefetch hint to optimize cache load*/
652     cache_prefetch ((__m128i*)ps);
653     cache_prefetch ((__m128i*)pd);
654     maybe_prefetch ((__m128i*)pm);
655
656     /* Align dst on a 16-byte boundary */
657     while (w && ((unsigned long)pd & 15))
658     {
659         d = *pd;
660         s = combine1 (ps, pm);
661
662         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
663         ps++;
664         if (pm)
665             pm++;
666         w--;
667     }
668
669     /* call prefetch hint to optimize cache load*/
670     cache_prefetch ((__m128i*)ps);
671     cache_prefetch ((__m128i*)pd);
672     maybe_prefetch ((__m128i*)pm);
673
674     while (w >= 4)
675     {
676         /* fill cache line with next memory */
677         cache_prefetch_next ((__m128i*)ps);
678         cache_prefetch_next ((__m128i*)pd);
679         maybe_prefetch_next ((__m128i*)pm);
680
681         /* I'm loading unaligned because I'm not sure about
682          * the address alignment.
683          */
684         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
685
686         if (is_opaque (xmm_src_hi))
687         {
688             save_128_aligned ((__m128i*)pd, xmm_src_hi);
689         }
690         else if (!is_zero (xmm_src_hi))
691         {
692             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
693
694             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
695             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
696
697             expand_alpha_2x128 (
698                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
699
700             over_2x128 (&xmm_src_lo, &xmm_src_hi,
701                         &xmm_alpha_lo, &xmm_alpha_hi,
702                         &xmm_dst_lo, &xmm_dst_hi);
703
704             /* rebuild the 4 pixel data and save */
705             save_128_aligned ((__m128i*)pd,
706                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
707         }
708
709         w -= 4;
710         ps += 4;
711         pd += 4;
712         if (pm)
713             pm += 4;
714     }
715
716     while (w)
717     {
718         d = *pd;
719         s = combine1 (ps, pm);
720
721         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
722         ps++;
723         if (pm)
724             pm++;
725
726         w--;
727     }
728 }
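/* Note (illustrative, not in the original source): core_combine_over_u_sse2
 * above shows the loop shape used by the wide combiners in this file: a
 * scalar head loop runs until the destination pointer is 16-byte aligned,
 * the main loop then handles four pixels per iteration with aligned
 * destination accesses (sources and masks are loaded unaligned), and a
 * scalar tail loop processes the remaining 0-3 pixels.
 */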
729
730 static force_inline void
731 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
732                                   const uint32_t* ps,
733                                   const uint32_t* pm,
734                                   int             w)
735 {
736     uint32_t s, d;
737
738     __m128i xmm_dst_lo, xmm_dst_hi;
739     __m128i xmm_src_lo, xmm_src_hi;
740     __m128i xmm_alpha_lo, xmm_alpha_hi;
741
742     /* call prefetch hint to optimize cache load*/
743     cache_prefetch ((__m128i*)ps);
744     cache_prefetch ((__m128i*)pd);
745     maybe_prefetch ((__m128i*)pm);
746
747     /* Align dst on a 16-byte boundary */
748     while (w &&
749            ((unsigned long)pd & 15))
750     {
751         d = *pd;
752         s = combine1 (ps, pm);
753
754         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
755         w--;
756         ps++;
757         if (pm)
758             pm++;
759     }
760
761     /* call prefetch hint to optimize cache load*/
762     cache_prefetch ((__m128i*)ps);
763     cache_prefetch ((__m128i*)pd);
764     maybe_prefetch ((__m128i*)pm);
765
766     while (w >= 4)
767     {
768         /* fill cache line with next memory */
769         cache_prefetch_next ((__m128i*)ps);
770         cache_prefetch_next ((__m128i*)pd);
771         maybe_prefetch_next ((__m128i*)pm);
772
773         /* I'm loading unaligned because I'm not sure
774          * about the address alignment.
775          */
776         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
777         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
778
779         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
780         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
781
782         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
783                             &xmm_alpha_lo, &xmm_alpha_hi);
784
785         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
786                     &xmm_alpha_lo, &xmm_alpha_hi,
787                     &xmm_src_lo, &xmm_src_hi);
788
789         /* rebuild the 4 pixel data and save */
790         save_128_aligned ((__m128i*)pd,
791                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
792
793         w -= 4;
794         ps += 4;
795         pd += 4;
796
797         if (pm)
798             pm += 4;
799     }
800
801     while (w)
802     {
803         d = *pd;
804         s = combine1 (ps, pm);
805
806         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
807         ps++;
808         w--;
809         if (pm)
810             pm++;
811     }
812 }
813
814 static force_inline uint32_t
815 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
816 {
817     uint32_t maska = src >> 24;
818
819     if (maska == 0)
820     {
821         return 0;
822     }
823     else if (maska != 0xff)
824     {
825         return pack_1x64_32 (
826             pix_multiply_1x64 (unpack_32_1x64 (dst),
827                                expand_alpha_1x64 (unpack_32_1x64 (src))));
828     }
829
830     return dst;
831 }
832
833 static force_inline void
834 core_combine_in_u_sse2 (uint32_t*       pd,
835                         const uint32_t* ps,
836                         const uint32_t* pm,
837                         int             w)
838 {
839     uint32_t s, d;
840
841     __m128i xmm_src_lo, xmm_src_hi;
842     __m128i xmm_dst_lo, xmm_dst_hi;
843
844     /* call prefetch hint to optimize cache load*/
845     cache_prefetch ((__m128i*)ps);
846     cache_prefetch ((__m128i*)pd);
847     maybe_prefetch ((__m128i*)pm);
848
849     while (w && ((unsigned long) pd & 15))
850     {
851         s = combine1 (ps, pm);
852         d = *pd;
853
854         *pd++ = core_combine_in_u_pixelsse2 (d, s);
855         w--;
856         ps++;
857         if (pm)
858             pm++;
859     }
860
861     /* call prefetch hint to optimize cache load*/
862     cache_prefetch ((__m128i*)ps);
863     cache_prefetch ((__m128i*)pd);
864     maybe_prefetch ((__m128i*)pm);
865
866     while (w >= 4)
867     {
868         /* fill cache line with next memory */
869         cache_prefetch_next ((__m128i*)ps);
870         cache_prefetch_next ((__m128i*)pd);
871         maybe_prefetch_next ((__m128i*)pm);
872
873         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
874         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
875
876         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
877         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
878
879         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
880         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
881                             &xmm_dst_lo, &xmm_dst_hi,
882                             &xmm_dst_lo, &xmm_dst_hi);
883
884         save_128_aligned ((__m128i*)pd,
885                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
886
887         ps += 4;
888         pd += 4;
889         w -= 4;
890         if (pm)
891             pm += 4;
892     }
893
894     while (w)
895     {
896         s = combine1 (ps, pm);
897         d = *pd;
898
899         *pd++ = core_combine_in_u_pixelsse2 (d, s);
900         w--;
901         ps++;
902         if (pm)
903             pm++;
904     }
905 }
906
907 static force_inline void
908 core_combine_reverse_in_u_sse2 (uint32_t*       pd,
909                                 const uint32_t* ps,
910                                 const uint32_t *pm,
911                                 int             w)
912 {
913     uint32_t s, d;
914
915     __m128i xmm_src_lo, xmm_src_hi;
916     __m128i xmm_dst_lo, xmm_dst_hi;
917
918     /* call prefetch hint to optimize cache load*/
919     cache_prefetch ((__m128i*)ps);
920     cache_prefetch ((__m128i*)pd);
921     maybe_prefetch ((__m128i*)pm);
922
923     while (w && ((unsigned long) pd & 15))
924     {
925         s = combine1 (ps, pm);
926         d = *pd;
927
928         *pd++ = core_combine_in_u_pixelsse2 (s, d);
929         ps++;
930         w--;
931         if (pm)
932             pm++;
933     }
934
935     /* call prefetch hint to optimize cache load*/
936     cache_prefetch ((__m128i*)ps);
937     cache_prefetch ((__m128i*)pd);
938     maybe_prefetch ((__m128i*)pm);
939
940     while (w >= 4)
941     {
942         /* fill cache line with next memory */
943         cache_prefetch_next ((__m128i*)ps);
944         cache_prefetch_next ((__m128i*)pd);
945         maybe_prefetch_next ((__m128i*)pm);
946
947         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
948         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
949
950         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
951         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
952
953         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
954         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
955                             &xmm_src_lo, &xmm_src_hi,
956                             &xmm_dst_lo, &xmm_dst_hi);
957
958         save_128_aligned (
959             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
960
961         ps += 4;
962         pd += 4;
963         w -= 4;
964         if (pm)
965             pm += 4;
966     }
967
968     while (w)
969     {
970         s = combine1 (ps, pm);
971         d = *pd;
972
973         *pd++ = core_combine_in_u_pixelsse2 (s, d);
974         w--;
975         ps++;
976         if (pm)
977             pm++;
978     }
979 }
980
981 static force_inline void
982 core_combine_reverse_out_u_sse2 (uint32_t*       pd,
983                                  const uint32_t* ps,
984                                  const uint32_t* pm,
985                                  int             w)
986 {
987     /* call prefetch hint to optimize cache load*/
988     cache_prefetch ((__m128i*)ps);
989     cache_prefetch ((__m128i*)pd);
990     maybe_prefetch ((__m128i*)pm);
991
992     while (w && ((unsigned long) pd & 15))
993     {
994         uint32_t s = combine1 (ps, pm);
995         uint32_t d = *pd;
996
997         *pd++ = pack_1x64_32 (
998             pix_multiply_1x64 (
999                 unpack_32_1x64 (d), negate_1x64 (
1000                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1001
1002         if (pm)
1003             pm++;
1004         ps++;
1005         w--;
1006     }
1007
1008     /* call prefetch hint to optimize cache load*/
1009     cache_prefetch ((__m128i*)ps);
1010     cache_prefetch ((__m128i*)pd);
1011     maybe_prefetch ((__m128i*)pm);
1012
1013     while (w >= 4)
1014     {
1015         __m128i xmm_src_lo, xmm_src_hi;
1016         __m128i xmm_dst_lo, xmm_dst_hi;
1017
1018         /* fill cache line with next memory */
1019         cache_prefetch_next ((__m128i*)ps);
1020         cache_prefetch_next ((__m128i*)pd);
1021         maybe_prefetch_next ((__m128i*)pm);
1022
1023         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1024         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1025
1026         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1027         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1028
1029         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1030         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1031
1032         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1033                             &xmm_src_lo, &xmm_src_hi,
1034                             &xmm_dst_lo, &xmm_dst_hi);
1035
1036         save_128_aligned (
1037             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1038
1039         ps += 4;
1040         pd += 4;
1041         if (pm)
1042             pm += 4;
1043
1044         w -= 4;
1045     }
1046
1047     while (w)
1048     {
1049         uint32_t s = combine1 (ps, pm);
1050         uint32_t d = *pd;
1051
1052         *pd++ = pack_1x64_32 (
1053             pix_multiply_1x64 (
1054                 unpack_32_1x64 (d), negate_1x64 (
1055                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1056         ps++;
1057         if (pm)
1058             pm++;
1059         w--;
1060     }
1061 }
1062
1063 static force_inline void
1064 core_combine_out_u_sse2 (uint32_t*       pd,
1065                          const uint32_t* ps,
1066                          const uint32_t* pm,
1067                          int             w)
1068 {
1069     /* call prefetch hint to optimize cache load*/
1070     cache_prefetch ((__m128i*)ps);
1071     cache_prefetch ((__m128i*)pd);
1072     maybe_prefetch ((__m128i*)pm);
1073
1074     while (w && ((unsigned long) pd & 15))
1075     {
1076         uint32_t s = combine1 (ps, pm);
1077         uint32_t d = *pd;
1078
1079         *pd++ = pack_1x64_32 (
1080             pix_multiply_1x64 (
1081                 unpack_32_1x64 (s), negate_1x64 (
1082                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1083         w--;
1084         ps++;
1085         if (pm)
1086             pm++;
1087     }
1088
1089     /* call prefetch hint to optimize cache load*/
1090     cache_prefetch ((__m128i*)ps);
1091     cache_prefetch ((__m128i*)pd);
1092     maybe_prefetch ((__m128i*)pm);
1093
1094     while (w >= 4)
1095     {
1096         __m128i xmm_src_lo, xmm_src_hi;
1097         __m128i xmm_dst_lo, xmm_dst_hi;
1098
1099         /* fill cache line with next memory */
1100         cache_prefetch_next ((__m128i*)ps);
1101         cache_prefetch_next ((__m128i*)pd);
1102         maybe_prefetch_next ((__m128i*)pm);
1103
1104         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1105         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1106
1107         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1108         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1109
1110         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1111         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1112
1113         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1114                             &xmm_dst_lo, &xmm_dst_hi,
1115                             &xmm_dst_lo, &xmm_dst_hi);
1116
1117         save_128_aligned (
1118             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1119
1120         ps += 4;
1121         pd += 4;
1122         w -= 4;
1123         if (pm)
1124             pm += 4;
1125     }
1126
1127     while (w)
1128     {
1129         uint32_t s = combine1 (ps, pm);
1130         uint32_t d = *pd;
1131
1132         *pd++ = pack_1x64_32 (
1133             pix_multiply_1x64 (
1134                 unpack_32_1x64 (s), negate_1x64 (
1135                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1136         w--;
1137         ps++;
1138         if (pm)
1139             pm++;
1140     }
1141 }
1142
1143 static force_inline uint32_t
1144 core_combine_atop_u_pixel_sse2 (uint32_t src,
1145                                 uint32_t dst)
1146 {
1147     __m64 s = unpack_32_1x64 (src);
1148     __m64 d = unpack_32_1x64 (dst);
1149
1150     __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1151     __m64 da = expand_alpha_1x64 (d);
1152
1153     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1154 }
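/* Note (added for clarity): the ATOP operator computed above is
 *
 *     result = src * alpha_dst + dst * (1 - alpha_src)
 *
 * evaluated with pix_add_multiply_1x64 on the unpacked 16-bit channels.
 */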
1155
1156 static force_inline void
1157 core_combine_atop_u_sse2 (uint32_t*       pd,
1158                           const uint32_t* ps,
1159                           const uint32_t* pm,
1160                           int             w)
1161 {
1162     uint32_t s, d;
1163
1164     __m128i xmm_src_lo, xmm_src_hi;
1165     __m128i xmm_dst_lo, xmm_dst_hi;
1166     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1167     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1168
1169     /* call prefetch hint to optimize cache load*/
1170     cache_prefetch ((__m128i*)ps);
1171     cache_prefetch ((__m128i*)pd);
1172     maybe_prefetch ((__m128i*)pm);
1173
1174     while (w && ((unsigned long) pd & 15))
1175     {
1176         s = combine1 (ps, pm);
1177         d = *pd;
1178
1179         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1180         w--;
1181         ps++;
1182         if (pm)
1183             pm++;
1184     }
1185
1186     /* call prefetch hint to optimize cache load*/
1187     cache_prefetch ((__m128i*)ps);
1188     cache_prefetch ((__m128i*)pd);
1189     maybe_prefetch ((__m128i*)pm);
1190
1191     while (w >= 4)
1192     {
1193         /* fill cache line with next memory */
1194         cache_prefetch_next ((__m128i*)ps);
1195         cache_prefetch_next ((__m128i*)pd);
1196         maybe_prefetch_next ((__m128i*)pm);
1197
1198         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1199         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1200
1201         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1202         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1203
1204         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1205                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1206         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1207                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1208
1209         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1210                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1211
1212         pix_add_multiply_2x128 (
1213             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1214             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1215             &xmm_dst_lo, &xmm_dst_hi);
1216
1217         save_128_aligned (
1218             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1219
1220         ps += 4;
1221         pd += 4;
1222         w -= 4;
1223         if (pm)
1224             pm += 4;
1225     }
1226
1227     while (w)
1228     {
1229         s = combine1 (ps, pm);
1230         d = *pd;
1231
1232         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1233         w--;
1234         ps++;
1235         if (pm)
1236             pm++;
1237     }
1238 }
1239
1240 static force_inline uint32_t
1241 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1242                                         uint32_t dst)
1243 {
1244     __m64 s = unpack_32_1x64 (src);
1245     __m64 d = unpack_32_1x64 (dst);
1246
1247     __m64 sa = expand_alpha_1x64 (s);
1248     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1249
1250     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1251 }
1252
1253 static force_inline void
1254 core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
1255                                   const uint32_t* ps,
1256                                   const uint32_t* pm,
1257                                   int             w)
1258 {
1259     uint32_t s, d;
1260
1261     __m128i xmm_src_lo, xmm_src_hi;
1262     __m128i xmm_dst_lo, xmm_dst_hi;
1263     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1264     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1265
1266     /* call prefetch hint to optimize cache load*/
1267     cache_prefetch ((__m128i*)ps);
1268     cache_prefetch ((__m128i*)pd);
1269     maybe_prefetch ((__m128i*)pm);
1270
1271     while (w && ((unsigned long) pd & 15))
1272     {
1273         s = combine1 (ps, pm);
1274         d = *pd;
1275
1276         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1277         ps++;
1278         w--;
1279         if (pm)
1280             pm++;
1281     }
1282
1283     /* call prefetch hint to optimize cache load*/
1284     cache_prefetch ((__m128i*)ps);
1285     cache_prefetch ((__m128i*)pd);
1286     maybe_prefetch ((__m128i*)pm);
1287
1288     while (w >= 4)
1289     {
1290         /* fill cache line with next memory */
1291         cache_prefetch_next ((__m128i*)ps);
1292         cache_prefetch_next ((__m128i*)pd);
1293         maybe_prefetch_next ((__m128i*)pm);
1294
1295         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1296         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1297
1298         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1299         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1300
1301         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1302                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1303         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1304                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1305
1306         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1307                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1308
1309         pix_add_multiply_2x128 (
1310             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1311             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1312             &xmm_dst_lo, &xmm_dst_hi);
1313
1314         save_128_aligned (
1315             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1316
1317         ps += 4;
1318         pd += 4;
1319         w -= 4;
1320         if (pm)
1321             pm += 4;
1322     }
1323
1324     while (w)
1325     {
1326         s = combine1 (ps, pm);
1327         d = *pd;
1328
1329         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1330         ps++;
1331         w--;
1332         if (pm)
1333             pm++;
1334     }
1335 }
1336
1337 static force_inline uint32_t
1338 core_combine_xor_u_pixel_sse2 (uint32_t src,
1339                                uint32_t dst)
1340 {
1341     __m64 s = unpack_32_1x64 (src);
1342     __m64 d = unpack_32_1x64 (dst);
1343
1344     __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1345     __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1346
1347     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1348 }
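/* Note (added for clarity): the XOR operator computed above is
 *
 *     result = src * (1 - alpha_dst) + dst * (1 - alpha_src)
 *
 * i.e. both products passed to pix_add_multiply_1x64 use negated alphas.
 */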
1349
1350 static force_inline void
1351 core_combine_xor_u_sse2 (uint32_t*       dst,
1352                          const uint32_t* src,
1353                          const uint32_t *mask,
1354                          int             width)
1355 {
1356     int w = width;
1357     uint32_t s, d;
1358     uint32_t* pd = dst;
1359     const uint32_t* ps = src;
1360     const uint32_t* pm = mask;
1361
1362     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1363     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1364     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1365     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1366
1367     /* call prefetch hint to optimize cache load*/
1368     cache_prefetch ((__m128i*)ps);
1369     cache_prefetch ((__m128i*)pd);
1370     maybe_prefetch ((__m128i*)pm);
1371
1372     while (w && ((unsigned long) pd & 15))
1373     {
1374         s = combine1 (ps, pm);
1375         d = *pd;
1376
1377         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1378         w--;
1379         ps++;
1380         if (pm)
1381             pm++;
1382     }
1383
1384     /* call prefetch hint to optimize cache load*/
1385     cache_prefetch ((__m128i*)ps);
1386     cache_prefetch ((__m128i*)pd);
1387     maybe_prefetch ((__m128i*)pm);
1388
1389     while (w >= 4)
1390     {
1391         /* fill cache line with next memory */
1392         cache_prefetch_next ((__m128i*)ps);
1393         cache_prefetch_next ((__m128i*)pd);
1394         maybe_prefetch_next ((__m128i*)pm);
1395
1396         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1397         xmm_dst = load_128_aligned ((__m128i*) pd);
1398
1399         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1400         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1401
1402         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1403                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1404         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1405                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1406
1407         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1408                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1409         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1410                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1411
1412         pix_add_multiply_2x128 (
1413             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1414             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1415             &xmm_dst_lo, &xmm_dst_hi);
1416
1417         save_128_aligned (
1418             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1419
1420         ps += 4;
1421         pd += 4;
1422         w -= 4;
1423         if (pm)
1424             pm += 4;
1425     }
1426
1427     while (w)
1428     {
1429         s = combine1 (ps, pm);
1430         d = *pd;
1431
1432         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1433         w--;
1434         ps++;
1435         if (pm)
1436             pm++;
1437     }
1438 }
1439
1440 static force_inline void
1441 core_combine_add_u_sse2 (uint32_t*       dst,
1442                          const uint32_t* src,
1443                          const uint32_t* mask,
1444                          int             width)
1445 {
1446     int w = width;
1447     uint32_t s, d;
1448     uint32_t* pd = dst;
1449     const uint32_t* ps = src;
1450     const uint32_t* pm = mask;
1451
1452     /* call prefetch hint to optimize cache load*/
1453     cache_prefetch ((__m128i*)ps);
1454     cache_prefetch ((__m128i*)pd);
1455     maybe_prefetch ((__m128i*)pm);
1456
1457     while (w && (unsigned long)pd & 15)
1458     {
1459         s = combine1 (ps, pm);
1460         d = *pd;
1461
1462         ps++;
1463         if (pm)
1464             pm++;
1465         *pd++ = _mm_cvtsi64_si32 (
1466             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1467         w--;
1468     }
1469
1470     /* call prefetch hint to optimize cache load*/
1471     cache_prefetch ((__m128i*)ps);
1472     cache_prefetch ((__m128i*)pd);
1473     maybe_prefetch ((__m128i*)pm);
1474
1475     while (w >= 4)
1476     {
1477         __m128i s;
1478
1479         /* fill cache line with next memory */
1480         cache_prefetch_next ((__m128i*)ps);
1481         cache_prefetch_next ((__m128i*)pd);
1482         maybe_prefetch_next ((__m128i*)pm);
1483
1484         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1485
1486         save_128_aligned (
1487             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1488
1489         pd += 4;
1490         ps += 4;
1491         if (pm)
1492             pm += 4;
1493         w -= 4;
1494     }
1495
1496     while (w--)
1497     {
1498         s = combine1 (ps, pm);
1499         d = *pd;
1500
1501         ps++;
1502         *pd++ = _mm_cvtsi64_si32 (
1503             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1504         if (pm)
1505             pm++;
1506     }
1507 }
1508
1509 static force_inline uint32_t
1510 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1511                                     uint32_t dst)
1512 {
1513     __m64 ms = unpack_32_1x64 (src);
1514     __m64 md = unpack_32_1x64 (dst);
1515     uint32_t sa = src >> 24;
1516     uint32_t da = ~dst >> 24;
1517
1518     if (sa > da)
1519     {
1520         ms = pix_multiply_1x64 (
1521             ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1522     }
1523
1524     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1525 }
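/* Note (illustrative, not in the original source): the SATURATE operator
 * above adds src to dst, but when the source alpha exceeds the remaining
 * destination coverage (~dst >> 24) the source is first scaled by
 * DIV_UN8 (da, sa), roughly da/sa, so the saturating add does not overflow
 * the destination alpha.
 */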
1526
1527 static force_inline void
1528 core_combine_saturate_u_sse2 (uint32_t *      pd,
1529                               const uint32_t *ps,
1530                               const uint32_t *pm,
1531                               int             w)
1532 {
1533     uint32_t s, d;
1534
1535     uint32_t pack_cmp;
1536     __m128i xmm_src, xmm_dst;
1537
1538     /* call prefetch hint to optimize cache load*/
1539     cache_prefetch ((__m128i*)ps);
1540     cache_prefetch ((__m128i*)pd);
1541     maybe_prefetch ((__m128i*)pm);
1542
1543     while (w && (unsigned long)pd & 15)
1544     {
1545         s = combine1 (ps, pm);
1546         d = *pd;
1547
1548         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1549         w--;
1550         ps++;
1551         if (pm)
1552             pm++;
1553     }
1554
1555     /* call prefetch hint to optimize cache load*/
1556     cache_prefetch ((__m128i*)ps);
1557     cache_prefetch ((__m128i*)pd);
1558     maybe_prefetch ((__m128i*)pm);
1559
1560     while (w >= 4)
1561     {
1562         /* fill cache line with next memory */
1563         cache_prefetch_next ((__m128i*)ps);
1564         cache_prefetch_next ((__m128i*)pd);
1565         maybe_prefetch_next ((__m128i*)pm);
1566
1567         xmm_dst = load_128_aligned  ((__m128i*)pd);
1568         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1569
1570         pack_cmp = _mm_movemask_epi8 (
1571             _mm_cmpgt_epi32 (
1572                 _mm_srli_epi32 (xmm_src, 24),
1573                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1574
1575         /* if some src alpha is greater than the respective ~dst alpha */
1576         if (pack_cmp)
1577         {
1578             s = combine1 (ps++, pm);
1579             d = *pd;
1580             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1581             if (pm)
1582                 pm++;
1583
1584             s = combine1 (ps++, pm);
1585             d = *pd;
1586             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1587             if (pm)
1588                 pm++;
1589
1590             s = combine1 (ps++, pm);
1591             d = *pd;
1592             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1593             if (pm)
1594                 pm++;
1595
1596             s = combine1 (ps++, pm);
1597             d = *pd;
1598             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1599             if (pm)
1600                 pm++;
1601         }
1602         else
1603         {
1604             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1605
1606             pd += 4;
1607             ps += 4;
1608             if (pm)
1609                 pm += 4;
1610         }
1611
1612         w -= 4;
1613     }
1614
1615     while (w--)
1616     {
1617         s = combine1 (ps, pm);
1618         d = *pd;
1619
1620         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1621         ps++;
1622         if (pm)
1623             pm++;
1624     }
1625 }
1626
1627 static force_inline void
1628 core_combine_src_ca_sse2 (uint32_t*       pd,
1629                           const uint32_t* ps,
1630                           const uint32_t *pm,
1631                           int             w)
1632 {
1633     uint32_t s, m;
1634
1635     __m128i xmm_src_lo, xmm_src_hi;
1636     __m128i xmm_mask_lo, xmm_mask_hi;
1637     __m128i xmm_dst_lo, xmm_dst_hi;
1638
1639     /* call prefetch hint to optimize cache load*/
1640     cache_prefetch ((__m128i*)ps);
1641     cache_prefetch ((__m128i*)pd);
1642     cache_prefetch ((__m128i*)pm);
1643
1644     while (w && (unsigned long)pd & 15)
1645     {
1646         s = *ps++;
1647         m = *pm++;
1648         *pd++ = pack_1x64_32 (
1649             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1650         w--;
1651     }
1652
1653     /* call prefetch hint to optimize cache load*/
1654     cache_prefetch ((__m128i*)ps);
1655     cache_prefetch ((__m128i*)pd);
1656     cache_prefetch ((__m128i*)pm);
1657
1658     while (w >= 4)
1659     {
1660         /* fill cache line with next memory */
1661         cache_prefetch_next ((__m128i*)ps);
1662         cache_prefetch_next ((__m128i*)pd);
1663         cache_prefetch_next ((__m128i*)pm);
1664
1665         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1666         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1667
1668         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1669         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1670
1671         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1672                             &xmm_mask_lo, &xmm_mask_hi,
1673                             &xmm_dst_lo, &xmm_dst_hi);
1674
1675         save_128_aligned (
1676             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1677
1678         ps += 4;
1679         pd += 4;
1680         pm += 4;
1681         w -= 4;
1682     }
1683
1684     while (w)
1685     {
1686         s = *ps++;
1687         m = *pm++;
1688         *pd++ = pack_1x64_32 (
1689             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1690         w--;
1691     }
1692 }
1693
1694 static force_inline uint32_t
1695 core_combine_over_ca_pixel_sse2 (uint32_t src,
1696                                  uint32_t mask,
1697                                  uint32_t dst)
1698 {
1699     __m64 s = unpack_32_1x64 (src);
1700     __m64 expAlpha = expand_alpha_1x64 (s);
1701     __m64 unpk_mask = unpack_32_1x64 (mask);
1702     __m64 unpk_dst  = unpack_32_1x64 (dst);
1703
1704     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1705 }
1706
1707 static force_inline void
1708 core_combine_over_ca_sse2 (uint32_t*       pd,
1709                            const uint32_t* ps,
1710                            const uint32_t *pm,
1711                            int             w)
1712 {
1713     uint32_t s, m, d;
1714
1715     __m128i xmm_alpha_lo, xmm_alpha_hi;
1716     __m128i xmm_src_lo, xmm_src_hi;
1717     __m128i xmm_dst_lo, xmm_dst_hi;
1718     __m128i xmm_mask_lo, xmm_mask_hi;
1719
1720     /* call prefetch hint to optimize cache load*/
1721     cache_prefetch ((__m128i*)ps);
1722     cache_prefetch ((__m128i*)pd);
1723     cache_prefetch ((__m128i*)pm);
1724
1725     while (w && (unsigned long)pd & 15)
1726     {
1727         s = *ps++;
1728         m = *pm++;
1729         d = *pd;
1730
1731         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1732         w--;
1733     }
1734
1735     /* call prefetch hint to optimize cache load*/
1736     cache_prefetch ((__m128i*)ps);
1737     cache_prefetch ((__m128i*)pd);
1738     cache_prefetch ((__m128i*)pm);
1739
1740     while (w >= 4)
1741     {
1742         /* fill cache line with next memory */
1743         cache_prefetch_next ((__m128i*)ps);
1744         cache_prefetch_next ((__m128i*)pd);
1745         cache_prefetch_next ((__m128i*)pm);
1746
1747         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1748         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1749         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1750
1751         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1752         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1753         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1754
1755         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1756                             &xmm_alpha_lo, &xmm_alpha_hi);
1757
1758         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1759                        &xmm_alpha_lo, &xmm_alpha_hi,
1760                        &xmm_mask_lo, &xmm_mask_hi,
1761                        &xmm_dst_lo, &xmm_dst_hi);
1762
1763         save_128_aligned (
1764             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1765
1766         ps += 4;
1767         pd += 4;
1768         pm += 4;
1769         w -= 4;
1770     }
1771
1772     while (w)
1773     {
1774         s = *ps++;
1775         m = *pm++;
1776         d = *pd;
1777
1778         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1779         w--;
1780     }
1781 }
1782
1783 static force_inline uint32_t
1784 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1785                                          uint32_t mask,
1786                                          uint32_t dst)
1787 {
1788     __m64 d = unpack_32_1x64 (dst);
1789
1790     return pack_1x64_32 (
1791         over_1x64 (d, expand_alpha_1x64 (d),
1792                    pix_multiply_1x64 (unpack_32_1x64 (src),
1793                                       unpack_32_1x64 (mask))));
1794 }
1795
1796 static force_inline void
1797 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1798                                    const uint32_t* ps,
1799                                    const uint32_t *pm,
1800                                    int             w)
1801 {
1802     uint32_t s, m, d;
1803
1804     __m128i xmm_alpha_lo, xmm_alpha_hi;
1805     __m128i xmm_src_lo, xmm_src_hi;
1806     __m128i xmm_dst_lo, xmm_dst_hi;
1807     __m128i xmm_mask_lo, xmm_mask_hi;
1808
1809     /* call prefetch hint to optimize cache load*/
1810     cache_prefetch ((__m128i*)ps);
1811     cache_prefetch ((__m128i*)pd);
1812     cache_prefetch ((__m128i*)pm);
1813
1814     while (w && (unsigned long)pd & 15)
1815     {
1816         s = *ps++;
1817         m = *pm++;
1818         d = *pd;
1819
1820         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1821         w--;
1822     }
1823
1824     /* call prefetch hint to optimize cache load*/
1825     cache_prefetch ((__m128i*)ps);
1826     cache_prefetch ((__m128i*)pd);
1827     cache_prefetch ((__m128i*)pm);
1828
1829     while (w >= 4)
1830     {
1831         /* fill cache line with next memory */
1832         cache_prefetch_next ((__m128i*)ps);
1833         cache_prefetch_next ((__m128i*)pd);
1834         cache_prefetch_next ((__m128i*)pm);
1835
1836         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1837         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1838         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1839
1840         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1841         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1842         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1843
1844         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1845                             &xmm_alpha_lo, &xmm_alpha_hi);
1846         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1847                             &xmm_mask_lo, &xmm_mask_hi,
1848                             &xmm_mask_lo, &xmm_mask_hi);
1849
1850         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1851                     &xmm_alpha_lo, &xmm_alpha_hi,
1852                     &xmm_mask_lo, &xmm_mask_hi);
1853
1854         save_128_aligned (
1855             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1856
1857         ps += 4;
1858         pd += 4;
1859         pm += 4;
1860         w -= 4;
1861     }
1862
1863     while (w)
1864     {
1865         s = *ps++;
1866         m = *pm++;
1867         d = *pd;
1868
1869         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1870         w--;
1871     }
1872 }
1873
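/* IN with component alpha:
 * result = (src * mask) * dst.alpha, computed per component.
 */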
1874 static force_inline void
1875 core_combine_in_ca_sse2 (uint32_t *      pd,
1876                          const uint32_t *ps,
1877                          const uint32_t *pm,
1878                          int             w)
1879 {
1880     uint32_t s, m, d;
1881
1882     __m128i xmm_alpha_lo, xmm_alpha_hi;
1883     __m128i xmm_src_lo, xmm_src_hi;
1884     __m128i xmm_dst_lo, xmm_dst_hi;
1885     __m128i xmm_mask_lo, xmm_mask_hi;
1886
1887     /* call prefetch hint to optimize cache load*/
1888     cache_prefetch ((__m128i*)ps);
1889     cache_prefetch ((__m128i*)pd);
1890     cache_prefetch ((__m128i*)pm);
1891
1892     while (w && (unsigned long)pd & 15)
1893     {
1894         s = *ps++;
1895         m = *pm++;
1896         d = *pd;
1897
1898         *pd++ = pack_1x64_32 (
1899             pix_multiply_1x64 (
1900                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1901                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1902
1903         w--;
1904     }
1905
1906     /* call prefetch hint to optimize cache load*/
1907     cache_prefetch ((__m128i*)ps);
1908     cache_prefetch ((__m128i*)pd);
1909     cache_prefetch ((__m128i*)pm);
1910
1911     while (w >= 4)
1912     {
1913         /* fill cache line with next memory */
1914         cache_prefetch_next ((__m128i*)ps);
1915         cache_prefetch_next ((__m128i*)pd);
1916         cache_prefetch_next ((__m128i*)pm);
1917
1918         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1919         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1920         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1921
1922         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1923         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1924         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1925
1926         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1927                             &xmm_alpha_lo, &xmm_alpha_hi);
1928
1929         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1930                             &xmm_mask_lo, &xmm_mask_hi,
1931                             &xmm_dst_lo, &xmm_dst_hi);
1932
1933         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1934                             &xmm_alpha_lo, &xmm_alpha_hi,
1935                             &xmm_dst_lo, &xmm_dst_hi);
1936
1937         save_128_aligned (
1938             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1939
1940         ps += 4;
1941         pd += 4;
1942         pm += 4;
1943         w -= 4;
1944     }
1945
1946     while (w)
1947     {
1948         s = *ps++;
1949         m = *pm++;
1950         d = *pd;
1951
1952         *pd++ = pack_1x64_32 (
1953             pix_multiply_1x64 (
1954                 pix_multiply_1x64 (
1955                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1956                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1957
1958         w--;
1959     }
1960 }
1961
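/* IN_REVERSE with component alpha:
 * result = dst * (mask * src.alpha), computed per component.
 */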
1962 static force_inline void
1963 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1964                                  const uint32_t *ps,
1965                                  const uint32_t *pm,
1966                                  int             w)
1967 {
1968     uint32_t s, m, d;
1969
1970     __m128i xmm_alpha_lo, xmm_alpha_hi;
1971     __m128i xmm_src_lo, xmm_src_hi;
1972     __m128i xmm_dst_lo, xmm_dst_hi;
1973     __m128i xmm_mask_lo, xmm_mask_hi;
1974
1975     /* call prefetch hint to optimize cache load*/
1976     cache_prefetch ((__m128i*)ps);
1977     cache_prefetch ((__m128i*)pd);
1978     cache_prefetch ((__m128i*)pm);
1979
1980     while (w && (unsigned long)pd & 15)
1981     {
1982         s = *ps++;
1983         m = *pm++;
1984         d = *pd;
1985
1986         *pd++ = pack_1x64_32 (
1987             pix_multiply_1x64 (
1988                 unpack_32_1x64 (d),
1989                 pix_multiply_1x64 (unpack_32_1x64 (m),
1990                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1991         w--;
1992     }
1993
1994     /* call prefetch hint to optimize cache load*/
1995     cache_prefetch ((__m128i*)ps);
1996     cache_prefetch ((__m128i*)pd);
1997     cache_prefetch ((__m128i*)pm);
1998
1999     while (w >= 4)
2000     {
2001         /* fill cache line with next memory */
2002         cache_prefetch_next ((__m128i*)ps);
2003         cache_prefetch_next ((__m128i*)pd);
2004         cache_prefetch_next ((__m128i*)pm);
2005
2006         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2007         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2008         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2009
2010         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2011         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2012         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2013
2014         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2015                             &xmm_alpha_lo, &xmm_alpha_hi);
2016         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2017                             &xmm_alpha_lo, &xmm_alpha_hi,
2018                             &xmm_alpha_lo, &xmm_alpha_hi);
2019
2020         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2021                             &xmm_alpha_lo, &xmm_alpha_hi,
2022                             &xmm_dst_lo, &xmm_dst_hi);
2023
2024         save_128_aligned (
2025             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2026
2027         ps += 4;
2028         pd += 4;
2029         pm += 4;
2030         w -= 4;
2031     }
2032
2033     while (w)
2034     {
2035         s = *ps++;
2036         m = *pm++;
2037         d = *pd;
2038
2039         *pd++ = pack_1x64_32 (
2040             pix_multiply_1x64 (
2041                 unpack_32_1x64 (d),
2042                 pix_multiply_1x64 (unpack_32_1x64 (m),
2043                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
2044         w--;
2045     }
2046 }
2047
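/* OUT with component alpha:
 * result = (src * mask) * (1 - dst.alpha), computed per component.
 */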
2048 static force_inline void
2049 core_combine_out_ca_sse2 (uint32_t *      pd,
2050                           const uint32_t *ps,
2051                           const uint32_t *pm,
2052                           int             w)
2053 {
2054     uint32_t s, m, d;
2055
2056     __m128i xmm_alpha_lo, xmm_alpha_hi;
2057     __m128i xmm_src_lo, xmm_src_hi;
2058     __m128i xmm_dst_lo, xmm_dst_hi;
2059     __m128i xmm_mask_lo, xmm_mask_hi;
2060
2061     /* call prefetch hint to optimize cache load*/
2062     cache_prefetch ((__m128i*)ps);
2063     cache_prefetch ((__m128i*)pd);
2064     cache_prefetch ((__m128i*)pm);
2065
2066     while (w && (unsigned long)pd & 15)
2067     {
2068         s = *ps++;
2069         m = *pm++;
2070         d = *pd;
2071
2072         *pd++ = pack_1x64_32 (
2073             pix_multiply_1x64 (
2074                 pix_multiply_1x64 (
2075                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2076                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2077         w--;
2078     }
2079
2080     /* call prefetch hint to optimize cache load*/
2081     cache_prefetch ((__m128i*)ps);
2082     cache_prefetch ((__m128i*)pd);
2083     cache_prefetch ((__m128i*)pm);
2084
2085     while (w >= 4)
2086     {
2087         /* fill cache line with next memory */
2088         cache_prefetch_next ((__m128i*)ps);
2089         cache_prefetch_next ((__m128i*)pd);
2090         cache_prefetch_next ((__m128i*)pm);
2091
2092         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2093         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2094         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2095
2096         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2097         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2098         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2099
2100         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2101                             &xmm_alpha_lo, &xmm_alpha_hi);
2102         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2103                       &xmm_alpha_lo, &xmm_alpha_hi);
2104
2105         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2106                             &xmm_mask_lo, &xmm_mask_hi,
2107                             &xmm_dst_lo, &xmm_dst_hi);
2108         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2109                             &xmm_alpha_lo, &xmm_alpha_hi,
2110                             &xmm_dst_lo, &xmm_dst_hi);
2111
2112         save_128_aligned (
2113             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2114
2115         ps += 4;
2116         pd += 4;
2117         pm += 4;
2118         w -= 4;
2119     }
2120
2121     while (w)
2122     {
2123         s = *ps++;
2124         m = *pm++;
2125         d = *pd;
2126
2127         *pd++ = pack_1x64_32 (
2128             pix_multiply_1x64 (
2129                 pix_multiply_1x64 (
2130                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2131                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2132
2133         w--;
2134     }
2135 }
2136
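/* OUT_REVERSE with component alpha:
 * result = dst * (1 - mask * src.alpha), computed per component.
 */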
2137 static force_inline void
2138 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
2139                                   const uint32_t *ps,
2140                                   const uint32_t *pm,
2141                                   int             w)
2142 {
2143     uint32_t s, m, d;
2144
2145     __m128i xmm_alpha_lo, xmm_alpha_hi;
2146     __m128i xmm_src_lo, xmm_src_hi;
2147     __m128i xmm_dst_lo, xmm_dst_hi;
2148     __m128i xmm_mask_lo, xmm_mask_hi;
2149
2150     /* call prefetch hint to optimize cache load*/
2151     cache_prefetch ((__m128i*)ps);
2152     cache_prefetch ((__m128i*)pd);
2153     cache_prefetch ((__m128i*)pm);
2154
2155     while (w && (unsigned long)pd & 15)
2156     {
2157         s = *ps++;
2158         m = *pm++;
2159         d = *pd;
2160
2161         *pd++ = pack_1x64_32 (
2162             pix_multiply_1x64 (
2163                 unpack_32_1x64 (d),
2164                 negate_1x64 (pix_multiply_1x64 (
2165                                  unpack_32_1x64 (m),
2166                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2167         w--;
2168     }
2169
2170     /* call prefetch hint to optimize cache load*/
2171     cache_prefetch ((__m128i*)ps);
2172     cache_prefetch ((__m128i*)pd);
2173     cache_prefetch ((__m128i*)pm);
2174
2175     while (w >= 4)
2176     {
2177         /* fill cache line with next memory */
2178         cache_prefetch_next ((__m128i*)ps);
2179         cache_prefetch_next ((__m128i*)pd);
2180         cache_prefetch_next ((__m128i*)pm);
2181
2182         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2183         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2184         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2185
2186         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2187         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2188         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2189
2190         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2191                             &xmm_alpha_lo, &xmm_alpha_hi);
2192
2193         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2194                             &xmm_alpha_lo, &xmm_alpha_hi,
2195                             &xmm_mask_lo, &xmm_mask_hi);
2196
2197         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2198                       &xmm_mask_lo, &xmm_mask_hi);
2199
2200         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2201                             &xmm_mask_lo, &xmm_mask_hi,
2202                             &xmm_dst_lo, &xmm_dst_hi);
2203
2204         save_128_aligned (
2205             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2206
2207         ps += 4;
2208         pd += 4;
2209         pm += 4;
2210         w -= 4;
2211     }
2212
2213     while (w)
2214     {
2215         s = *ps++;
2216         m = *pm++;
2217         d = *pd;
2218
2219         *pd++ = pack_1x64_32 (
2220             pix_multiply_1x64 (
2221                 unpack_32_1x64 (d),
2222                 negate_1x64 (pix_multiply_1x64 (
2223                                  unpack_32_1x64 (m),
2224                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2225         w--;
2226     }
2227 }
2228
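/* ATOP with component alpha:
 * result = (src * mask) * dst.alpha + dst * (1 - mask * src.alpha),
 * computed per component.
 */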
2229 static force_inline uint32_t
2230 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2231                                  uint32_t mask,
2232                                  uint32_t dst)
2233 {
2234     __m64 m = unpack_32_1x64 (mask);
2235     __m64 s = unpack_32_1x64 (src);
2236     __m64 d = unpack_32_1x64 (dst);
2237     __m64 sa = expand_alpha_1x64 (s);
2238     __m64 da = expand_alpha_1x64 (d);
2239
2240     s = pix_multiply_1x64 (s, m);
2241     m = negate_1x64 (pix_multiply_1x64 (m, sa));
2242
2243     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2244 }
2245
2246 static force_inline void
2247 core_combine_atop_ca_sse2 (uint32_t *      pd,
2248                            const uint32_t *ps,
2249                            const uint32_t *pm,
2250                            int             w)
2251 {
2252     uint32_t s, m, d;
2253
2254     __m128i xmm_src_lo, xmm_src_hi;
2255     __m128i xmm_dst_lo, xmm_dst_hi;
2256     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2257     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2258     __m128i xmm_mask_lo, xmm_mask_hi;
2259
2260     /* call prefetch hint to optimize cache load*/
2261     cache_prefetch ((__m128i*)ps);
2262     cache_prefetch ((__m128i*)pd);
2263     cache_prefetch ((__m128i*)pm);
2264
2265     while (w && (unsigned long)pd & 15)
2266     {
2267         s = *ps++;
2268         m = *pm++;
2269         d = *pd;
2270
2271         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2272         w--;
2273     }
2274
2275     /* call prefetch hint to optimize cache load*/
2276     cache_prefetch ((__m128i*)ps);
2277     cache_prefetch ((__m128i*)pd);
2278     cache_prefetch ((__m128i*)pm);
2279
2280     while (w >= 4)
2281     {
2282         /* fill cache line with next memory */
2283         cache_prefetch_next ((__m128i*)ps);
2284         cache_prefetch_next ((__m128i*)pd);
2285         cache_prefetch_next ((__m128i*)pm);
2286
2287         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2288         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2289         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2290
2291         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2292         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2293         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2294
2295         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2296                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2297         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2298                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2299
2300         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2301                             &xmm_mask_lo, &xmm_mask_hi,
2302                             &xmm_src_lo, &xmm_src_hi);
2303         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2304                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2305                             &xmm_mask_lo, &xmm_mask_hi);
2306
2307         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2308
2309         pix_add_multiply_2x128 (
2310             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2311             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2312             &xmm_dst_lo, &xmm_dst_hi);
2313
2314         save_128_aligned (
2315             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2316
2317         ps += 4;
2318         pd += 4;
2319         pm += 4;
2320         w -= 4;
2321     }
2322
2323     while (w)
2324     {
2325         s = *ps++;
2326         m = *pm++;
2327         d = *pd;
2328
2329         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2330         w--;
2331     }
2332 }
2333
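/* ATOP_REVERSE with component alpha:
 * result = (src * mask) * (1 - dst.alpha) + dst * (mask * src.alpha),
 * computed per component.
 */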
2334 static force_inline uint32_t
2335 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2336                                          uint32_t mask,
2337                                          uint32_t dst)
2338 {
2339     __m64 m = unpack_32_1x64 (mask);
2340     __m64 s = unpack_32_1x64 (src);
2341     __m64 d = unpack_32_1x64 (dst);
2342
2343     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2344     __m64 sa = expand_alpha_1x64 (s);
2345
2346     s = pix_multiply_1x64 (s, m);
2347     m = pix_multiply_1x64 (m, sa);
2348
2349     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2350 }
2351
2352 static force_inline void
2353 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2354                                    const uint32_t *ps,
2355                                    const uint32_t *pm,
2356                                    int             w)
2357 {
2358     uint32_t s, m, d;
2359
2360     __m128i xmm_src_lo, xmm_src_hi;
2361     __m128i xmm_dst_lo, xmm_dst_hi;
2362     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2363     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2364     __m128i xmm_mask_lo, xmm_mask_hi;
2365
2366     /* call prefetch hint to optimize cache load*/
2367     cache_prefetch ((__m128i*)ps);
2368     cache_prefetch ((__m128i*)pd);
2369     cache_prefetch ((__m128i*)pm);
2370
2371     while (w && (unsigned long)pd & 15)
2372     {
2373         s = *ps++;
2374         m = *pm++;
2375         d = *pd;
2376
2377         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2378         w--;
2379     }
2380
2381     /* call prefetch hint to optimize cache load*/
2382     cache_prefetch ((__m128i*)ps);
2383     cache_prefetch ((__m128i*)pd);
2384     cache_prefetch ((__m128i*)pm);
2385
2386     while (w >= 4)
2387     {
2388         /* fill cache line with next memory */
2389         cache_prefetch_next ((__m128i*)ps);
2390         cache_prefetch_next ((__m128i*)pd);
2391         cache_prefetch_next ((__m128i*)pm);
2392
2393         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2394         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2395         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2396
2397         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2398         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2399         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2400
2401         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2402                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2403         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2404                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2405
2406         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2407                             &xmm_mask_lo, &xmm_mask_hi,
2408                             &xmm_src_lo, &xmm_src_hi);
2409         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2410                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2411                             &xmm_mask_lo, &xmm_mask_hi);
2412
2413         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2414                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2415
2416         pix_add_multiply_2x128 (
2417             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2418             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2419             &xmm_dst_lo, &xmm_dst_hi);
2420
2421         save_128_aligned (
2422             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2423
2424         ps += 4;
2425         pd += 4;
2426         pm += 4;
2427         w -= 4;
2428     }
2429
2430     while (w)
2431     {
2432         s = *ps++;
2433         m = *pm++;
2434         d = *pd;
2435
2436         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2437         w--;
2438     }
2439 }
2440
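/* XOR with component alpha:
 * result = (src * mask) * (1 - dst.alpha) + dst * (1 - mask * src.alpha),
 * computed per component.  The temporaries below are named after the
 * argument slots of pix_add_multiply_1x64 () rather than what they hold:
 * alpha_dst multiplies the destination, alpha_src multiplies the masked
 * source.
 */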
2441 static force_inline uint32_t
2442 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2443                                 uint32_t mask,
2444                                 uint32_t dst)
2445 {
2446     __m64 a = unpack_32_1x64 (mask);
2447     __m64 s = unpack_32_1x64 (src);
2448     __m64 d = unpack_32_1x64 (dst);
2449
2450     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2451                                        a, expand_alpha_1x64 (s)));
2452     __m64 dest      = pix_multiply_1x64 (s, a);
2453     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2454
2455     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2456                                                 &alpha_dst,
2457                                                 &dest,
2458                                                 &alpha_src));
2459 }
2460
2461 static force_inline void
2462 core_combine_xor_ca_sse2 (uint32_t *      pd,
2463                           const uint32_t *ps,
2464                           const uint32_t *pm,
2465                           int             w)
2466 {
2467     uint32_t s, m, d;
2468
2469     __m128i xmm_src_lo, xmm_src_hi;
2470     __m128i xmm_dst_lo, xmm_dst_hi;
2471     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2472     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2473     __m128i xmm_mask_lo, xmm_mask_hi;
2474
2475     /* call prefetch hint to optimize cache load*/
2476     cache_prefetch ((__m128i*)ps);
2477     cache_prefetch ((__m128i*)pd);
2478     cache_prefetch ((__m128i*)pm);
2479
2480     while (w && (unsigned long)pd & 15)
2481     {
2482         s = *ps++;
2483         m = *pm++;
2484         d = *pd;
2485
2486         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2487         w--;
2488     }
2489
2490     /* call prefetch hint to optimize cache load*/
2491     cache_prefetch ((__m128i*)ps);
2492     cache_prefetch ((__m128i*)pd);
2493     cache_prefetch ((__m128i*)pm);
2494
2495     while (w >= 4)
2496     {
2497         /* fill cache line with next memory */
2498         cache_prefetch_next ((__m128i*)ps);
2499         cache_prefetch_next ((__m128i*)pd);
2500         cache_prefetch_next ((__m128i*)pm);
2501
2502         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2503         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2504         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2505
2506         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2507         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2508         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2509
2510         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2511                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2512         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2513                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2514
2515         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2516                             &xmm_mask_lo, &xmm_mask_hi,
2517                             &xmm_src_lo, &xmm_src_hi);
2518         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2519                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2520                             &xmm_mask_lo, &xmm_mask_hi);
2521
2522         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2523                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2524         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2525                       &xmm_mask_lo, &xmm_mask_hi);
2526
2527         pix_add_multiply_2x128 (
2528             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2529             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2530             &xmm_dst_lo, &xmm_dst_hi);
2531
2532         save_128_aligned (
2533             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2534
2535         ps += 4;
2536         pd += 4;
2537         pm += 4;
2538         w -= 4;
2539     }
2540
2541     while (w)
2542     {
2543         s = *ps++;
2544         m = *pm++;
2545         d = *pd;
2546
2547         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2548         w--;
2549     }
2550 }
2551
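/* ADD with component alpha:
 * result = clamp (src * mask + dst), using saturating byte additions.
 */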
2552 static force_inline void
2553 core_combine_add_ca_sse2 (uint32_t *      pd,
2554                           const uint32_t *ps,
2555                           const uint32_t *pm,
2556                           int             w)
2557 {
2558     uint32_t s, m, d;
2559
2560     __m128i xmm_src_lo, xmm_src_hi;
2561     __m128i xmm_dst_lo, xmm_dst_hi;
2562     __m128i xmm_mask_lo, xmm_mask_hi;
2563
2564     /* call prefetch hint to optimize cache load*/
2565     cache_prefetch ((__m128i*)ps);
2566     cache_prefetch ((__m128i*)pd);
2567     cache_prefetch ((__m128i*)pm);
2568
2569     while (w && (unsigned long)pd & 15)
2570     {
2571         s = *ps++;
2572         m = *pm++;
2573         d = *pd;
2574
2575         *pd++ = pack_1x64_32 (
2576             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2577                                              unpack_32_1x64 (m)),
2578                           unpack_32_1x64 (d)));
2579         w--;
2580     }
2581
2582     /* call prefetch hint to optimize cache load*/
2583     cache_prefetch ((__m128i*)ps);
2584     cache_prefetch ((__m128i*)pd);
2585     cache_prefetch ((__m128i*)pm);
2586
2587     while (w >= 4)
2588     {
2589         /* fill cache line with next memory */
2590         cache_prefetch_next ((__m128i*)ps);
2591         cache_prefetch_next ((__m128i*)pd);
2592         cache_prefetch_next ((__m128i*)pm);
2593
2594         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2595         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2596         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2597
2598         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2599         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2600         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2601
2602         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2603                             &xmm_mask_lo, &xmm_mask_hi,
2604                             &xmm_src_lo, &xmm_src_hi);
2605
2606         save_128_aligned (
2607             (__m128i*)pd, pack_2x128_128 (
2608                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2609                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2610
2611         ps += 4;
2612         pd += 4;
2613         pm += 4;
2614         w -= 4;
2615     }
2616
2617     while (w)
2618     {
2619         s = *ps++;
2620         m = *pm++;
2621         d = *pd;
2622
2623         *pd++ = pack_1x64_32 (
2624             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2625                                              unpack_32_1x64 (m)),
2626                           unpack_32_1x64 (d)));
2627         w--;
2628     }
2629 }
2630
2631 /* ---------------------------------------------------
2632  * fb_compose_setup_SSE2
2633  */
2634 static force_inline __m64
2635 create_mask_16_64 (uint16_t mask)
2636 {
2637     return _mm_set1_pi16 (mask);
2638 }
2639
2640 static force_inline __m128i
2641 create_mask_16_128 (uint16_t mask)
2642 {
2643     return _mm_set1_epi16 (mask);
2644 }
2645
2646 static force_inline __m64
2647 create_mask_2x32_64 (uint32_t mask0,
2648                      uint32_t mask1)
2649 {
2650     return _mm_set_pi32 (mask0, mask1);
2651 }
2652
2653 /* Work around a code generation bug in Sun Studio 12. */
2654 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2655 # define create_mask_2x32_128(mask0, mask1)                             \
2656     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2657 #else
2658 static force_inline __m128i
2659 create_mask_2x32_128 (uint32_t mask0,
2660                       uint32_t mask1)
2661 {
2662     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2663 }
2664 #endif
2665
2666 /* SSE2 code patch for fbcompose.c */
2667
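/* Each sse2_combine_* wrapper below forwards to the corresponding
 * core_combine_*_sse2 routine and then calls _mm_empty () to clear the
 * MMX state before returning, since the single-pixel helpers use MMX
 * registers.
 */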
2668 static void
2669 sse2_combine_over_u (pixman_implementation_t *imp,
2670                      pixman_op_t              op,
2671                      uint32_t *               dst,
2672                      const uint32_t *         src,
2673                      const uint32_t *         mask,
2674                      int                      width)
2675 {
2676     core_combine_over_u_sse2 (dst, src, mask, width);
2677     _mm_empty ();
2678 }
2679
2680 static void
2681 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2682                              pixman_op_t              op,
2683                              uint32_t *               dst,
2684                              const uint32_t *         src,
2685                              const uint32_t *         mask,
2686                              int                      width)
2687 {
2688     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2689     _mm_empty ();
2690 }
2691
2692 static void
2693 sse2_combine_in_u (pixman_implementation_t *imp,
2694                    pixman_op_t              op,
2695                    uint32_t *               dst,
2696                    const uint32_t *         src,
2697                    const uint32_t *         mask,
2698                    int                      width)
2699 {
2700     core_combine_in_u_sse2 (dst, src, mask, width);
2701     _mm_empty ();
2702 }
2703
2704 static void
2705 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2706                            pixman_op_t              op,
2707                            uint32_t *               dst,
2708                            const uint32_t *         src,
2709                            const uint32_t *         mask,
2710                            int                      width)
2711 {
2712     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2713     _mm_empty ();
2714 }
2715
2716 static void
2717 sse2_combine_out_u (pixman_implementation_t *imp,
2718                     pixman_op_t              op,
2719                     uint32_t *               dst,
2720                     const uint32_t *         src,
2721                     const uint32_t *         mask,
2722                     int                      width)
2723 {
2724     core_combine_out_u_sse2 (dst, src, mask, width);
2725     _mm_empty ();
2726 }
2727
2728 static void
2729 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2730                             pixman_op_t              op,
2731                             uint32_t *               dst,
2732                             const uint32_t *         src,
2733                             const uint32_t *         mask,
2734                             int                      width)
2735 {
2736     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2737     _mm_empty ();
2738 }
2739
2740 static void
2741 sse2_combine_atop_u (pixman_implementation_t *imp,
2742                      pixman_op_t              op,
2743                      uint32_t *               dst,
2744                      const uint32_t *         src,
2745                      const uint32_t *         mask,
2746                      int                      width)
2747 {
2748     core_combine_atop_u_sse2 (dst, src, mask, width);
2749     _mm_empty ();
2750 }
2751
2752 static void
2753 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2754                              pixman_op_t              op,
2755                              uint32_t *               dst,
2756                              const uint32_t *         src,
2757                              const uint32_t *         mask,
2758                              int                      width)
2759 {
2760     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2761     _mm_empty ();
2762 }
2763
2764 static void
2765 sse2_combine_xor_u (pixman_implementation_t *imp,
2766                     pixman_op_t              op,
2767                     uint32_t *               dst,
2768                     const uint32_t *         src,
2769                     const uint32_t *         mask,
2770                     int                      width)
2771 {
2772     core_combine_xor_u_sse2 (dst, src, mask, width);
2773     _mm_empty ();
2774 }
2775
2776 static void
2777 sse2_combine_add_u (pixman_implementation_t *imp,
2778                     pixman_op_t              op,
2779                     uint32_t *               dst,
2780                     const uint32_t *         src,
2781                     const uint32_t *         mask,
2782                     int                      width)
2783 {
2784     core_combine_add_u_sse2 (dst, src, mask, width);
2785     _mm_empty ();
2786 }
2787
2788 static void
2789 sse2_combine_saturate_u (pixman_implementation_t *imp,
2790                          pixman_op_t              op,
2791                          uint32_t *               dst,
2792                          const uint32_t *         src,
2793                          const uint32_t *         mask,
2794                          int                      width)
2795 {
2796     core_combine_saturate_u_sse2 (dst, src, mask, width);
2797     _mm_empty ();
2798 }
2799
2800 static void
2801 sse2_combine_src_ca (pixman_implementation_t *imp,
2802                      pixman_op_t              op,
2803                      uint32_t *               dst,
2804                      const uint32_t *         src,
2805                      const uint32_t *         mask,
2806                      int                      width)
2807 {
2808     core_combine_src_ca_sse2 (dst, src, mask, width);
2809     _mm_empty ();
2810 }
2811
2812 static void
2813 sse2_combine_over_ca (pixman_implementation_t *imp,
2814                       pixman_op_t              op,
2815                       uint32_t *               dst,
2816                       const uint32_t *         src,
2817                       const uint32_t *         mask,
2818                       int                      width)
2819 {
2820     core_combine_over_ca_sse2 (dst, src, mask, width);
2821     _mm_empty ();
2822 }
2823
2824 static void
2825 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2826                               pixman_op_t              op,
2827                               uint32_t *               dst,
2828                               const uint32_t *         src,
2829                               const uint32_t *         mask,
2830                               int                      width)
2831 {
2832     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2833     _mm_empty ();
2834 }
2835
2836 static void
2837 sse2_combine_in_ca (pixman_implementation_t *imp,
2838                     pixman_op_t              op,
2839                     uint32_t *               dst,
2840                     const uint32_t *         src,
2841                     const uint32_t *         mask,
2842                     int                      width)
2843 {
2844     core_combine_in_ca_sse2 (dst, src, mask, width);
2845     _mm_empty ();
2846 }
2847
2848 static void
2849 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2850                             pixman_op_t              op,
2851                             uint32_t *               dst,
2852                             const uint32_t *         src,
2853                             const uint32_t *         mask,
2854                             int                      width)
2855 {
2856     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2857     _mm_empty ();
2858 }
2859
2860 static void
2861 sse2_combine_out_ca (pixman_implementation_t *imp,
2862                      pixman_op_t              op,
2863                      uint32_t *               dst,
2864                      const uint32_t *         src,
2865                      const uint32_t *         mask,
2866                      int                      width)
2867 {
2868     core_combine_out_ca_sse2 (dst, src, mask, width);
2869     _mm_empty ();
2870 }
2871
2872 static void
2873 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2874                              pixman_op_t              op,
2875                              uint32_t *               dst,
2876                              const uint32_t *         src,
2877                              const uint32_t *         mask,
2878                              int                      width)
2879 {
2880     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2881     _mm_empty ();
2882 }
2883
2884 static void
2885 sse2_combine_atop_ca (pixman_implementation_t *imp,
2886                       pixman_op_t              op,
2887                       uint32_t *               dst,
2888                       const uint32_t *         src,
2889                       const uint32_t *         mask,
2890                       int                      width)
2891 {
2892     core_combine_atop_ca_sse2 (dst, src, mask, width);
2893     _mm_empty ();
2894 }
2895
2896 static void
2897 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2898                               pixman_op_t              op,
2899                               uint32_t *               dst,
2900                               const uint32_t *         src,
2901                               const uint32_t *         mask,
2902                               int                      width)
2903 {
2904     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2905     _mm_empty ();
2906 }
2907
2908 static void
2909 sse2_combine_xor_ca (pixman_implementation_t *imp,
2910                      pixman_op_t              op,
2911                      uint32_t *               dst,
2912                      const uint32_t *         src,
2913                      const uint32_t *         mask,
2914                      int                      width)
2915 {
2916     core_combine_xor_ca_sse2 (dst, src, mask, width);
2917     _mm_empty ();
2918 }
2919
2920 static void
2921 sse2_combine_add_ca (pixman_implementation_t *imp,
2922                      pixman_op_t              op,
2923                      uint32_t *               dst,
2924                      const uint32_t *         src,
2925                      const uint32_t *         mask,
2926                      int                      width)
2927 {
2928     core_combine_add_ca_sse2 (dst, src, mask, width);
2929     _mm_empty ();
2930 }
2931
2932 /* -------------------------------------------------------------------
2933  * composite_over_n_8888
2934  */
2935
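/* Fast path: solid (n) source OVER a 32-bit 8888 destination. */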
2936 static void
2937 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2938                             pixman_op_t              op,
2939                             pixman_image_t *         src_image,
2940                             pixman_image_t *         mask_image,
2941                             pixman_image_t *         dst_image,
2942                             int32_t                  src_x,
2943                             int32_t                  src_y,
2944                             int32_t                  mask_x,
2945                             int32_t                  mask_y,
2946                             int32_t                  dest_x,
2947                             int32_t                  dest_y,
2948                             int32_t                  width,
2949                             int32_t                  height)
2950 {
2951     uint32_t src;
2952     uint32_t    *dst_line, *dst, d;
2953     int32_t w;
2954     int dst_stride;
2955     __m128i xmm_src, xmm_alpha;
2956     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2957
2958     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2959
2960     if (src == 0)
2961         return;
2962
2963     PIXMAN_IMAGE_GET_LINE (
2964         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2965
2966     xmm_src = expand_pixel_32_1x128 (src);
2967     xmm_alpha = expand_alpha_1x128 (xmm_src);
2968
2969     while (height--)
2970     {
2971         dst = dst_line;
2972
2973         /* call prefetch hint to optimize cache load*/
2974         cache_prefetch ((__m128i*)dst);
2975
2976         dst_line += dst_stride;
2977         w = width;
2978
2979         while (w && (unsigned long)dst & 15)
2980         {
2981             d = *dst;
2982             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2983                                               _mm_movepi64_pi64 (xmm_alpha),
2984                                               unpack_32_1x64 (d)));
2985             w--;
2986         }
2987
2988         cache_prefetch ((__m128i*)dst);
2989
2990         while (w >= 4)
2991         {
2992             /* fill cache line with next memory */
2993             cache_prefetch_next ((__m128i*)dst);
2994
2995             xmm_dst = load_128_aligned ((__m128i*)dst);
2996
2997             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2998
2999             over_2x128 (&xmm_src, &xmm_src,
3000                         &xmm_alpha, &xmm_alpha,
3001                         &xmm_dst_lo, &xmm_dst_hi);
3002
3003             /* rebuild the 4 pixel data and save */
3004             save_128_aligned (
3005                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3006
3007             w -= 4;
3008             dst += 4;
3009         }
3010
3011         while (w)
3012         {
3013             d = *dst;
3014             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3015                                               _mm_movepi64_pi64 (xmm_alpha),
3016                                               unpack_32_1x64 (d)));
3017             w--;
3018         }
3019
3020     }
3021     _mm_empty ();
3022 }
3023
3024 /* ---------------------------------------------------------------------
3025  * composite_over_n_0565
3026  */
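/* Fast path: solid (n) source OVER an r5g6b5 destination.  Destination
 * pixels are expanded to 8888, blended, and packed back to 565.
 */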
3027 static void
3028 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3029                             pixman_op_t              op,
3030                             pixman_image_t *         src_image,
3031                             pixman_image_t *         mask_image,
3032                             pixman_image_t *         dst_image,
3033                             int32_t                  src_x,
3034                             int32_t                  src_y,
3035                             int32_t                  mask_x,
3036                             int32_t                  mask_y,
3037                             int32_t                  dest_x,
3038                             int32_t                  dest_y,
3039                             int32_t                  width,
3040                             int32_t                  height)
3041 {
3042     uint32_t src;
3043     uint16_t    *dst_line, *dst, d;
3044     int32_t w;
3045     int dst_stride;
3046     __m128i xmm_src, xmm_alpha;
3047     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3048
3049     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3050
3051     if (src == 0)
3052         return;
3053
3054     PIXMAN_IMAGE_GET_LINE (
3055         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3056
3057     xmm_src = expand_pixel_32_1x128 (src);
3058     xmm_alpha = expand_alpha_1x128 (xmm_src);
3059
3060     while (height--)
3061     {
3062         dst = dst_line;
3063
3064         /* call prefetch hint to optimize cache load*/
3065         cache_prefetch ((__m128i*)dst);
3066
3067         dst_line += dst_stride;
3068         w = width;
3069
3070         while (w && (unsigned long)dst & 15)
3071         {
3072             d = *dst;
3073
3074             *dst++ = pack_565_32_16 (
3075                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3076                                          _mm_movepi64_pi64 (xmm_alpha),
3077                                          expand565_16_1x64 (d))));
3078             w--;
3079         }
3080
3081         /* call prefetch hint to optimize cache load*/
3082         cache_prefetch ((__m128i*)dst);
3083
3084         while (w >= 8)
3085         {
3086             /* fill cache line with next memory */
3087             cache_prefetch_next ((__m128i*)dst);
3088
3089             xmm_dst = load_128_aligned ((__m128i*)dst);
3090
3091             unpack_565_128_4x128 (xmm_dst,
3092                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3093
3094             over_2x128 (&xmm_src, &xmm_src,
3095                         &xmm_alpha, &xmm_alpha,
3096                         &xmm_dst0, &xmm_dst1);
3097             over_2x128 (&xmm_src, &xmm_src,
3098                         &xmm_alpha, &xmm_alpha,
3099                         &xmm_dst2, &xmm_dst3);
3100
3101             xmm_dst = pack_565_4x128_128 (
3102                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3103
3104             save_128_aligned ((__m128i*)dst, xmm_dst);
3105
3106             dst += 8;
3107             w -= 8;
3108         }
3109
3110         while (w--)
3111         {
3112             d = *dst;
3113             *dst++ = pack_565_32_16 (
3114                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3115                                          _mm_movepi64_pi64 (xmm_alpha),
3116                                          expand565_16_1x64 (d))));
3117         }
3118     }
3119
3120     _mm_empty ();
3121 }
3122
3123 /* ------------------------------
3124  * composite_add_n_8888_8888_ca
3125  */
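/* Fast path: solid (n) source multiplied by an a8r8g8b8 component-alpha
 * mask and saturate-added to an a8r8g8b8 destination.  Blocks of four
 * pixels whose mask is entirely zero are skipped.
 */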
3126 static void
3127 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3128                                    pixman_op_t              op,
3129                                    pixman_image_t *         src_image,
3130                                    pixman_image_t *         mask_image,
3131                                    pixman_image_t *         dst_image,
3132                                    int32_t                  src_x,
3133                                    int32_t                  src_y,
3134                                    int32_t                  mask_x,
3135                                    int32_t                  mask_y,
3136                                    int32_t                  dest_x,
3137                                    int32_t                  dest_y,
3138                                    int32_t                  width,
3139                                    int32_t                  height)
3140 {
3141     uint32_t src, srca;
3142     uint32_t    *dst_line, d;
3143     uint32_t    *mask_line, m;
3144     uint32_t pack_cmp;
3145     int dst_stride, mask_stride;
3146
3147     __m128i xmm_src, xmm_alpha;
3148     __m128i xmm_dst;
3149     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3150
3151     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3152
3153     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3154     srca = src >> 24;
3155
3156     if (src == 0)
3157         return;
3158
3159     PIXMAN_IMAGE_GET_LINE (
3160         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3161     PIXMAN_IMAGE_GET_LINE (
3162         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3163
3164     xmm_src = _mm_unpacklo_epi8 (
3165         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3166     xmm_alpha = expand_alpha_1x128 (xmm_src);
3167     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3168     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3169
3170     while (height--)
3171     {
3172         int w = width;
3173         const uint32_t *pm = (uint32_t *)mask_line;
3174         uint32_t *pd = (uint32_t *)dst_line;
3175
3176         dst_line += dst_stride;
3177         mask_line += mask_stride;
3178
3179         /* call prefetch hint to optimize cache load*/
3180         cache_prefetch ((__m128i*)pd);
3181         cache_prefetch ((__m128i*)pm);
3182
3183         while (w && (unsigned long)pd & 15)
3184         {
3185             m = *pm++;
3186
3187             if (m)
3188             {
3189                 d = *pd;
3190
3191                 mmx_mask = unpack_32_1x64 (m);
3192                 mmx_dest = unpack_32_1x64 (d);
3193
3194                 *pd = pack_1x64_32 (
3195                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3196             }
3197
3198             pd++;
3199             w--;
3200         }
3201
3202         /* call prefetch hint to optimize cache load*/
3203         cache_prefetch ((__m128i*)pd);
3204         cache_prefetch ((__m128i*)pm);
3205
3206         while (w >= 4)
3207         {
3208             /* fill cache line with next memory */
3209             cache_prefetch_next ((__m128i*)pd);
3210             cache_prefetch_next ((__m128i*)pm);
3211
3212             xmm_mask = load_128_unaligned ((__m128i*)pm);
3213
3214             pack_cmp =
3215                 _mm_movemask_epi8 (
3216                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3217
3218             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3219             if (pack_cmp != 0xffff)
3220             {
3221                 xmm_dst = load_128_aligned ((__m128i*)pd);
3222
3223                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3224
3225                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3226                                     &xmm_mask_lo, &xmm_mask_hi,
3227                                     &xmm_mask_lo, &xmm_mask_hi);
3228                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3229
3230                 save_128_aligned (
3231                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3232             }
3233
3234             pd += 4;
3235             pm += 4;
3236             w -= 4;
3237         }
3238
3239         while (w)
3240         {
3241             m = *pm++;
3242
3243             if (m)
3244             {
3245                 d = *pd;
3246
3247                 mmx_mask = unpack_32_1x64 (m);
3248                 mmx_dest = unpack_32_1x64 (d);
3249
3250                 *pd = pack_1x64_32 (
3251                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3252             }
3253
3254             pd++;
3255             w--;
3256         }
3257     }
3258
3259     _mm_empty ();
3260 }
3261
3262 /* ---------------------------------------------------------------------------
3263  * composite_over_n_8888_8888_ca
3264  */
3265
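/* Fast path: solid (n) source IN an a8r8g8b8 component-alpha mask, OVER an
 * a8r8g8b8 destination.  Blocks of four pixels whose mask is entirely zero
 * are skipped.
 */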
3266 static void
3267 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3268                                     pixman_op_t              op,
3269                                     pixman_image_t *         src_image,
3270                                     pixman_image_t *         mask_image,
3271                                     pixman_image_t *         dst_image,
3272                                     int32_t                  src_x,
3273                                     int32_t                  src_y,
3274                                     int32_t                  mask_x,
3275                                     int32_t                  mask_y,
3276                                     int32_t                  dest_x,
3277                                     int32_t                  dest_y,
3278                                     int32_t                  width,
3279                                     int32_t                  height)
3280 {
3281     uint32_t src;
3282     uint32_t    *dst_line, d;
3283     uint32_t    *mask_line, m;
3284     uint32_t pack_cmp;
3285     int dst_stride, mask_stride;
3286
3287     __m128i xmm_src, xmm_alpha;
3288     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3289     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3290
3291     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3292
3293     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3294
3295     if (src == 0)
3296         return;
3297
3298     PIXMAN_IMAGE_GET_LINE (
3299         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3300     PIXMAN_IMAGE_GET_LINE (
3301         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3302
3303     xmm_src = _mm_unpacklo_epi8 (
3304         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3305     xmm_alpha = expand_alpha_1x128 (xmm_src);
3306     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3307     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3308
3309     while (height--)
3310     {
3311         int w = width;
3312         const uint32_t *pm = (uint32_t *)mask_line;
3313         uint32_t *pd = (uint32_t *)dst_line;
3314
3315         dst_line += dst_stride;
3316         mask_line += mask_stride;
3317
3318         /* call prefetch hint to optimize cache load*/
3319         cache_prefetch ((__m128i*)pd);
3320         cache_prefetch ((__m128i*)pm);
3321
3322         while (w && (unsigned long)pd & 15)
3323         {
3324             m = *pm++;
3325
3326             if (m)
3327             {
3328                 d = *pd;
3329                 mmx_mask = unpack_32_1x64 (m);
3330                 mmx_dest = unpack_32_1x64 (d);
3331
3332                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3333                                                   &mmx_alpha,
3334                                                   &mmx_mask,
3335                                                   &mmx_dest));
3336             }
3337
3338             pd++;
3339             w--;
3340         }
3341
3342         /* call prefetch hint to optimize cache load*/
3343         cache_prefetch ((__m128i*)pd);
3344         cache_prefetch ((__m128i*)pm);
3345
3346         while (w >= 4)
3347         {
3348             /* fill cache line with next memory */
3349             cache_prefetch_next ((__m128i*)pd);
3350             cache_prefetch_next ((__m128i*)pm);
3351
3352             xmm_mask = load_128_unaligned ((__m128i*)pm);
3353
3354             pack_cmp =
3355                 _mm_movemask_epi8 (
3356                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3357
3358             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3359             if (pack_cmp != 0xffff)
3360             {
3361                 xmm_dst = load_128_aligned ((__m128i*)pd);
3362
3363                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3364                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3365
3366                 in_over_2x128 (&xmm_src, &xmm_src,
3367                                &xmm_alpha, &xmm_alpha,
3368                                &xmm_mask_lo, &xmm_mask_hi,
3369                                &xmm_dst_lo, &xmm_dst_hi);
3370
3371                 save_128_aligned (
3372                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3373             }
3374
3375             pd += 4;
3376             pm += 4;
3377             w -= 4;
3378         }
3379
3380         while (w)
3381         {
3382             m = *pm++;
3383
3384             if (m)
3385             {
3386                 d = *pd;
3387                 mmx_mask = unpack_32_1x64 (m);
3388                 mmx_dest = unpack_32_1x64 (d);
3389
3390                 *pd = pack_1x64_32 (
3391                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3392             }
3393
3394             pd++;
3395             w--;
3396         }
3397     }
3398
3399     _mm_empty ();
3400 }
3401
3402 /* ---------------------------------------------------------------------
3403  * composite_over_8888_n_8888
3404  */
3405
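/* Fast path: a8r8g8b8 source OVER an a8r8g8b8 destination, modulated by a
 * solid mask.  Only the alpha channel of the mask (mask >> 24) is used.
 */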
3406 static void
3407 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3408                                  pixman_op_t              op,
3409                                  pixman_image_t *         src_image,
3410                                  pixman_image_t *         mask_image,
3411                                  pixman_image_t *         dst_image,
3412                                  int32_t                  src_x,
3413                                  int32_t                  src_y,
3414                                  int32_t                  mask_x,
3415                                  int32_t                  mask_y,
3416                                  int32_t                  dest_x,
3417                                  int32_t                  dest_y,
3418                                  int32_t                  width,
3419                                  int32_t                  height)
3420 {
3421     uint32_t    *dst_line, *dst;
3422     uint32_t    *src_line, *src;
3423     uint32_t mask;
3424     int32_t w;
3425     int dst_stride, src_stride;
3426
3427     __m128i xmm_mask;
3428     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3429     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3430     __m128i xmm_alpha_lo, xmm_alpha_hi;
3431
3432     PIXMAN_IMAGE_GET_LINE (
3433         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3434     PIXMAN_IMAGE_GET_LINE (
3435         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3436
3437     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3438
3439     xmm_mask = create_mask_16_128 (mask >> 24);
3440
3441     while (height--)
3442     {
3443         dst = dst_line;
3444         dst_line += dst_stride;
3445         src = src_line;
3446         src_line += src_stride;
3447         w = width;
3448
3449         /* call prefetch hint to optimize cache load */
3450         cache_prefetch ((__m128i*)dst);
3451         cache_prefetch ((__m128i*)src);
3452
3453         while (w && (unsigned long)dst & 15)
3454         {
3455             uint32_t s = *src++;
3456             uint32_t d = *dst;
3457
3458             __m64 ms = unpack_32_1x64 (s);
3459             __m64 alpha    = expand_alpha_1x64 (ms);
3460             __m64 mask     = _mm_movepi64_pi64 (xmm_mask);
3461             __m64 dest     = unpack_32_1x64 (d);
3462
3463             *dst++ = pack_1x64_32 (
3464                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3465
3466             w--;
3467         }
3468
3469         /* call prefetch hint to optimize cache load */
3470         cache_prefetch ((__m128i*)dst);
3471         cache_prefetch ((__m128i*)src);
3472
3473         while (w >= 4)
3474         {
3475             /* fill cache line with the next chunk of memory */
3476             cache_prefetch_next ((__m128i*)dst);
3477             cache_prefetch_next ((__m128i*)src);
3478
3479             xmm_src = load_128_unaligned ((__m128i*)src);
3480             xmm_dst = load_128_aligned ((__m128i*)dst);
3481
3482             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3483             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3484             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3485                                 &xmm_alpha_lo, &xmm_alpha_hi);
3486
3487             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3488                            &xmm_alpha_lo, &xmm_alpha_hi,
3489                            &xmm_mask, &xmm_mask,
3490                            &xmm_dst_lo, &xmm_dst_hi);
3491
3492             save_128_aligned (
3493                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3494
3495             dst += 4;
3496             src += 4;
3497             w -= 4;
3498         }
3499
3500         while (w)
3501         {
3502             uint32_t s = *src++;
3503             uint32_t d = *dst;
3504
3505             __m64 ms = unpack_32_1x64 (s);
3506             __m64 alpha = expand_alpha_1x64 (ms);
3507             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3508             __m64 dest  = unpack_32_1x64 (d);
3509
3510             *dst++ = pack_1x64_32 (
3511                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3512
3513             w--;
3514         }
3515     }
3516
3517     _mm_empty ();
3518 }
3519
3520 /* ---------------------------------------------------------------------
3521  * composite_over_x888_n_8888
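 *
 * Same as the 8888_n_8888 case, except that the source alpha channel is
 * ignored and treated as 0xff (x8r8g8b8 source).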
3522  */
3523 static void
3524 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3525                                  pixman_op_t              op,
3526                                  pixman_image_t *         src_image,
3527                                  pixman_image_t *         mask_image,
3528                                  pixman_image_t *         dst_image,
3529                                  int32_t                  src_x,
3530                                  int32_t                  src_y,
3531                                  int32_t                  mask_x,
3532                                  int32_t                  mask_y,
3533                                  int32_t                  dest_x,
3534                                  int32_t                  dest_y,
3535                                  int32_t                  width,
3536                                  int32_t                  height)
3537 {
3538     uint32_t    *dst_line, *dst;
3539     uint32_t    *src_line, *src;
3540     uint32_t mask;
3541     int dst_stride, src_stride;
3542     int32_t w;
3543
3544     __m128i xmm_mask, xmm_alpha;
3545     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3546     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3547
3548     PIXMAN_IMAGE_GET_LINE (
3549         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3550     PIXMAN_IMAGE_GET_LINE (
3551         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3552
3553     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3554
3555     xmm_mask = create_mask_16_128 (mask >> 24);
3556     xmm_alpha = mask_00ff;
3557
3558     while (height--)
3559     {
3560         dst = dst_line;
3561         dst_line += dst_stride;
3562         src = src_line;
3563         src_line += src_stride;
3564         w = width;
3565
3566         /* call prefetch hint to optimize cache load */
3567         cache_prefetch ((__m128i*)dst);
3568         cache_prefetch ((__m128i*)src);
3569
3570         while (w && (unsigned long)dst & 15)
3571         {
3572             uint32_t s = (*src++) | 0xff000000;
3573             uint32_t d = *dst;
3574
3575             __m64 src   = unpack_32_1x64 (s);
3576             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3577             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3578             __m64 dest  = unpack_32_1x64 (d);
3579
3580             *dst++ = pack_1x64_32 (
3581                 in_over_1x64 (&src, &alpha, &mask, &dest));
3582
3583             w--;
3584         }
3585
3586         /* call prefetch hint to optimize cache load */
3587         cache_prefetch ((__m128i*)dst);
3588         cache_prefetch ((__m128i*)src);
3589
3590         while (w >= 4)
3591         {
3592             /* fill cache line with the next chunk of memory */
3593             cache_prefetch_next ((__m128i*)dst);
3594             cache_prefetch_next ((__m128i*)src);
3595
3596             xmm_src = _mm_or_si128 (
3597                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3598             xmm_dst = load_128_aligned ((__m128i*)dst);
3599
3600             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3601             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3602
3603             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3604                            &xmm_alpha, &xmm_alpha,
3605                            &xmm_mask, &xmm_mask,
3606                            &xmm_dst_lo, &xmm_dst_hi);
3607
3608             save_128_aligned (
3609                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3610
3611             dst += 4;
3612             src += 4;
3613             w -= 4;
3614
3615         }
3616
3617         while (w)
3618         {
3619             uint32_t s = (*src++) | 0xff000000;
3620             uint32_t d = *dst;
3621
3622             __m64 src  = unpack_32_1x64 (s);
3623             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3624             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3625             __m64 dest  = unpack_32_1x64 (d);
3626
3627             *dst++ = pack_1x64_32 (
3628                 in_over_1x64 (&src, &alpha, &mask, &dest));
3629
3630             w--;
3631         }
3632     }
3633
3634     _mm_empty ();
3635 }
3636
3637 /* --------------------------------------------------------------------
3638  * composite_over_8888_8888
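 *
 * Plain OVER of an a8r8g8b8 source onto an a8r8g8b8 destination; each
 * scanline is handed to core_combine_over_u_sse2().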
3639  */
3640 static void
3641 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3642                                pixman_op_t              op,
3643                                pixman_image_t *         src_image,
3644                                pixman_image_t *         mask_image,
3645                                pixman_image_t *         dst_image,
3646                                int32_t                  src_x,
3647                                int32_t                  src_y,
3648                                int32_t                  mask_x,
3649                                int32_t                  mask_y,
3650                                int32_t                  dest_x,
3651                                int32_t                  dest_y,
3652                                int32_t                  width,
3653                                int32_t                  height)
3654 {
3655     int dst_stride, src_stride;
3656     uint32_t    *dst_line, *dst;
3657     uint32_t    *src_line, *src;
3658
3659     PIXMAN_IMAGE_GET_LINE (
3660         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3661     PIXMAN_IMAGE_GET_LINE (
3662         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3663
3664     dst = dst_line;
3665     src = src_line;
3666
3667     while (height--)
3668     {
3669         core_combine_over_u_sse2 (dst, src, NULL, width);
3670
3671         dst += dst_stride;
3672         src += src_stride;
3673     }
3674     _mm_empty ();
3675 }
3676
3677 /* ------------------------------------------------------------------
3678  * composite_over_8888_0565
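 *
 * OVER of an a8r8g8b8 source onto an r5g6b5 destination.  Destination
 * pixels are expanded to 8888, composited, and repacked to 565.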
3679  */
3680 static force_inline uint16_t
3681 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3682 {
3683     __m64 ms;
3684
3685     ms = unpack_32_1x64 (src);
3686     return pack_565_32_16 (
3687         pack_1x64_32 (
3688             over_1x64 (
3689                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3690 }
3691
3692 static void
3693 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3694                                pixman_op_t              op,
3695                                pixman_image_t *         src_image,
3696                                pixman_image_t *         mask_image,
3697                                pixman_image_t *         dst_image,
3698                                int32_t                  src_x,
3699                                int32_t                  src_y,
3700                                int32_t                  mask_x,
3701                                int32_t                  mask_y,
3702                                int32_t                  dest_x,
3703                                int32_t                  dest_y,
3704                                int32_t                  width,
3705                                int32_t                  height)
3706 {
3707     uint16_t    *dst_line, *dst, d;
3708     uint32_t    *src_line, *src, s;
3709     int dst_stride, src_stride;
3710     int32_t w;
3711
3712     __m128i xmm_alpha_lo, xmm_alpha_hi;
3713     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3714     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3715
3716     PIXMAN_IMAGE_GET_LINE (
3717         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3718     PIXMAN_IMAGE_GET_LINE (
3719         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3720
3721 #if 0
3722     /* FIXME
3723      *
3724      * This code is copied from the MMX version, along with its FIXME.
3725      * If it's a problem there, it's probably a problem here too.
3726      */
3727     assert (src_image->drawable == mask_image->drawable);
3728 #endif
3729
3730     while (height--)
3731     {
3732         dst = dst_line;
3733         src = src_line;
3734
3735         /* call prefetch hint to optimize cache load */
3736         cache_prefetch ((__m128i*)src);
3737         cache_prefetch ((__m128i*)dst);
3738
3739         dst_line += dst_stride;
3740         src_line += src_stride;
3741         w = width;
3742
3743         /* Align dst on a 16-byte boundary */
3744         while (w &&
3745                ((unsigned long)dst & 15))
3746         {
3747             s = *src++;
3748             d = *dst;
3749
3750             *dst++ = composite_over_8888_0565pixel (s, d);
3751             w--;
3752         }
3753
3754         /* call prefetch hint to optimize cache load */
3755         cache_prefetch ((__m128i*)src);
3756         cache_prefetch ((__m128i*)dst);
3757
3758         /* It's an 8-pixel loop */
3759         while (w >= 8)
3760         {
3761             /* fill cache line with the next chunk of memory */
3762             cache_prefetch_next ((__m128i*)src);
3763             cache_prefetch_next ((__m128i*)dst);
3764
3765             /* I'm loading unaligned because I'm not sure
3766              * about the address alignment.
3767              */
3768             xmm_src = load_128_unaligned ((__m128i*) src);
3769             xmm_dst = load_128_aligned ((__m128i*) dst);
3770
3771             /* Unpacking */
3772             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3773             unpack_565_128_4x128 (xmm_dst,
3774                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3775             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3776                                 &xmm_alpha_lo, &xmm_alpha_hi);
3777
3778             /* I'm loading the next 4 pixels from memory
3779              * early, to optimize the memory read.
3780              */
3781             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3782
3783             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3784                         &xmm_alpha_lo, &xmm_alpha_hi,
3785                         &xmm_dst0, &xmm_dst1);
3786
3787             /* Unpacking */
3788             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3789             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3790                                 &xmm_alpha_lo, &xmm_alpha_hi);
3791
3792             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3793                         &xmm_alpha_lo, &xmm_alpha_hi,
3794                         &xmm_dst2, &xmm_dst3);
3795
3796             save_128_aligned (
3797                 (__m128i*)dst, pack_565_4x128_128 (
3798                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3799
3800             w -= 8;
3801             dst += 8;
3802             src += 8;
3803         }
3804
3805         while (w--)
3806         {
3807             s = *src++;
3808             d = *dst;
3809
3810             *dst++ = composite_over_8888_0565pixel (s, d);
3811         }
3812     }
3813
3814     _mm_empty ();
3815 }
3816
3817 /* -----------------------------------------------------------------
3818  * composite_over_n_8_8888
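 *
 * OVER of a solid source onto an a8r8g8b8 destination, through an a8 mask.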
3819  */
3820
3821 static void
3822 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3823                               pixman_op_t              op,
3824                               pixman_image_t *         src_image,
3825                               pixman_image_t *         mask_image,
3826                               pixman_image_t *         dst_image,
3827                               int32_t                  src_x,
3828                               int32_t                  src_y,
3829                               int32_t                  mask_x,
3830                               int32_t                  mask_y,
3831                               int32_t                  dest_x,
3832                               int32_t                  dest_y,
3833                               int32_t                  width,
3834                               int32_t                  height)
3835 {
3836     uint32_t src, srca;
3837     uint32_t *dst_line, *dst;
3838     uint8_t *mask_line, *mask;
3839     int dst_stride, mask_stride;
3840     int32_t w;
3841     uint32_t m, d;
3842
3843     __m128i xmm_src, xmm_alpha, xmm_def;
3844     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3845     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3846
3847     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3848
3849     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3850
3851     srca = src >> 24;
3852     if (src == 0)
3853         return;
3854
3855     PIXMAN_IMAGE_GET_LINE (
3856         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3857     PIXMAN_IMAGE_GET_LINE (
3858         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3859
3860     xmm_def = create_mask_2x32_128 (src, src);
3861     xmm_src = expand_pixel_32_1x128 (src);
3862     xmm_alpha = expand_alpha_1x128 (xmm_src);
3863     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3864     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3865
3866     while (height--)
3867     {
3868         dst = dst_line;
3869         dst_line += dst_stride;
3870         mask = mask_line;
3871         mask_line += mask_stride;
3872         w = width;
3873
3874         /* call prefetch hint to optimize cache load */
3875         cache_prefetch ((__m128i*)mask);
3876         cache_prefetch ((__m128i*)dst);
3877
3878         while (w && (unsigned long)dst & 15)
3879         {
3880             uint8_t m = *mask++;
3881
3882             if (m)
3883             {
3884                 d = *dst;
3885                 mmx_mask = expand_pixel_8_1x64 (m);
3886                 mmx_dest = unpack_32_1x64 (d);
3887
3888                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3889                                                    &mmx_alpha,
3890                                                    &mmx_mask,
3891                                                    &mmx_dest));
3892             }
3893
3894             w--;
3895             dst++;
3896         }
3897
3898         /* call prefetch hint to optimize cache load */
3899         cache_prefetch ((__m128i*)mask);
3900         cache_prefetch ((__m128i*)dst);
3901
3902         while (w >= 4)
3903         {
3904             /* fill cache line with the next chunk of memory */
3905             cache_prefetch_next ((__m128i*)mask);
3906             cache_prefetch_next ((__m128i*)dst);
3907
3908             m = *((uint32_t*)mask);
3909
3910             if (srca == 0xff && m == 0xffffffff)
3911             {
3912                 save_128_aligned ((__m128i*)dst, xmm_def);
3913             }
3914             else if (m)
3915             {
3916                 xmm_dst = load_128_aligned ((__m128i*) dst);
3917                 xmm_mask = unpack_32_1x128 (m);
3918                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3919
3920                 /* Unpacking */
3921                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3922                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3923
3924                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3925                                         &xmm_mask_lo, &xmm_mask_hi);
3926
3927                 in_over_2x128 (&xmm_src, &xmm_src,
3928                                &xmm_alpha, &xmm_alpha,
3929                                &xmm_mask_lo, &xmm_mask_hi,
3930                                &xmm_dst_lo, &xmm_dst_hi);
3931
3932                 save_128_aligned (
3933                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3934             }
3935
3936             w -= 4;
3937             dst += 4;
3938             mask += 4;
3939         }
3940
3941         while (w)
3942         {
3943             uint8_t m = *mask++;
3944
3945             if (m)
3946             {
3947                 d = *dst;
3948                 mmx_mask = expand_pixel_8_1x64 (m);
3949                 mmx_dest = unpack_32_1x64 (d);
3950
3951                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3952                                                    &mmx_alpha,
3953                                                    &mmx_mask,
3954                                                    &mmx_dest));
3955             }
3956
3957             w--;
3958             dst++;
3959         }
3960     }
3961
3962     _mm_empty ();
3963 }
3964
3965 /* ----------------------------------------------------------------
3966  * pixman_fill_sse2
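 *
 * Solid fill of a rectangle in a 16 bpp or 32 bpp destination, using
 * 16-byte aligned stores for the bulk of each scanline.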
3967  */
3968
3969 pixman_bool_t
3970 pixman_fill_sse2 (uint32_t *bits,
3971                   int       stride,
3972                   int       bpp,
3973                   int       x,
3974                   int       y,
3975                   int       width,
3976                   int       height,
3977                   uint32_t  data)
3978 {
3979     uint32_t byte_width;
3980     uint8_t         *byte_line;
3981
3982     __m128i xmm_def;
3983
3984     if (bpp != 16 && bpp != 32)
3985         return FALSE;
3986
3987     if (bpp == 16)
3988     {
3989         stride = stride * (int) sizeof (uint32_t) / 2;
3990         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3991         byte_width = 2 * width;
3992         stride *= 2;
3993         data = (data & 0xffff) * 0x00010001;
3994     }
3995     else
3996     {
3997         stride = stride * (int) sizeof (uint32_t) / 4;
3998         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3999         byte_width = 4 * width;
4000         stride *= 4;
4001     }
4002
4003     cache_prefetch ((__m128i*)byte_line);
4004     xmm_def = create_mask_2x32_128 (data, data);
4005
4006     while (height--)
4007     {
4008         int w;
4009         uint8_t *d = byte_line;
4010         byte_line += stride;
4011         w = byte_width;
4012
4013
4014         cache_prefetch_next ((__m128i*)d);
4015
4016         while (w >= 2 && ((unsigned long)d & 3))
4017         {
4018             *(uint16_t *)d = data;
4019             w -= 2;
4020             d += 2;
4021         }
4022
4023         while (w >= 4 && ((unsigned long)d & 15))
4024         {
4025             *(uint32_t *)d = data;
4026
4027             w -= 4;
4028             d += 4;
4029         }
4030
4031         cache_prefetch_next ((__m128i*)d);
4032
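        /* Fill the aligned middle of the scanline in 128-, 64-, 32- and
         * 16-byte blocks, then finish the tail with 4- and 2-byte stores.
         */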
4033         while (w >= 128)
4034         {
4035             cache_prefetch (((__m128i*)d) + 12);
4036
4037             save_128_aligned ((__m128i*)(d),     xmm_def);
4038             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4039             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4040             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4041             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
4042             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
4043             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
4044             save_128_aligned ((__m128i*)(d + 112), xmm_def);
4045
4046             d += 128;
4047             w -= 128;
4048         }
4049
4050         if (w >= 64)
4051         {
4052             cache_prefetch (((__m128i*)d) + 8);
4053
4054             save_128_aligned ((__m128i*)(d),     xmm_def);
4055             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4056             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4057             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4058
4059             d += 64;
4060             w -= 64;
4061         }
4062
4063         cache_prefetch_next ((__m128i*)d);
4064
4065         if (w >= 32)
4066         {
4067             save_128_aligned ((__m128i*)(d),     xmm_def);
4068             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4069
4070             d += 32;
4071             w -= 32;
4072         }
4073
4074         if (w >= 16)
4075         {
4076             save_128_aligned ((__m128i*)(d),     xmm_def);
4077
4078             d += 16;
4079             w -= 16;
4080         }
4081
4082         cache_prefetch_next ((__m128i*)d);
4083
4084         while (w >= 4)
4085         {
4086             *(uint32_t *)d = data;
4087
4088             w -= 4;
4089             d += 4;
4090         }
4091
4092         if (w >= 2)
4093         {
4094             *(uint16_t *)d = data;
4095             w -= 2;
4096             d += 2;
4097         }
4098     }
4099
4100     _mm_empty ();
4101     return TRUE;
4102 }
4103
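/* -----------------------------------------------------------------
 * composite_src_n_8_8888
 *
 * SRC of a solid color through an a8 mask into an a8r8g8b8 destination;
 * pixels where the mask is zero are written as zero.
 */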
4104 static void
4105 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4106                              pixman_op_t              op,
4107                              pixman_image_t *         src_image,
4108                              pixman_image_t *         mask_image,
4109                              pixman_image_t *         dst_image,
4110                              int32_t                  src_x,
4111                              int32_t                  src_y,
4112                              int32_t                  mask_x,
4113                              int32_t                  mask_y,
4114                              int32_t                  dest_x,
4115                              int32_t                  dest_y,
4116                              int32_t                  width,
4117                              int32_t                  height)
4118 {
4119     uint32_t src, srca;
4120     uint32_t    *dst_line, *dst;
4121     uint8_t     *mask_line, *mask;
4122     int dst_stride, mask_stride;
4123     int32_t w;
4124     uint32_t m;
4125
4126     __m128i xmm_src, xmm_def;
4127     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4128
4129     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4130
4131     srca = src >> 24;
4132     if (src == 0)
4133     {
4134         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4135                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
4136                           dest_x, dest_y, width, height, 0);
4137         return;
4138     }
4139
4140     PIXMAN_IMAGE_GET_LINE (
4141         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4142     PIXMAN_IMAGE_GET_LINE (
4143         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4144
4145     xmm_def = create_mask_2x32_128 (src, src);
4146     xmm_src = expand_pixel_32_1x128 (src);
4147
4148     while (height--)
4149     {
4150         dst = dst_line;
4151         dst_line += dst_stride;
4152         mask = mask_line;
4153         mask_line += mask_stride;
4154         w = width;
4155
4156         /* call prefetch hint to optimize cache load */
4157         cache_prefetch ((__m128i*)mask);
4158         cache_prefetch ((__m128i*)dst);
4159
4160         while (w && (unsigned long)dst & 15)
4161         {
4162             uint8_t m = *mask++;
4163
4164             if (m)
4165             {
4166                 *dst = pack_1x64_32 (
4167                     pix_multiply_1x64 (
4168                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4169             }
4170             else
4171             {
4172                 *dst = 0;
4173             }
4174
4175             w--;
4176             dst++;
4177         }
4178
4179         /* call prefetch hint to optimize cache load */
4180         cache_prefetch ((__m128i*)mask);
4181         cache_prefetch ((__m128i*)dst);
4182
4183         while (w >= 4)
4184         {
4185             /* fill cache line with the next chunk of memory */
4186             cache_prefetch_next ((__m128i*)mask);
4187             cache_prefetch_next ((__m128i*)dst);
4188
4189             m = *((uint32_t*)mask);
4190
4191             if (srca == 0xff && m == 0xffffffff)
4192             {
4193                 save_128_aligned ((__m128i*)dst, xmm_def);
4194             }
4195             else if (m)
4196             {
4197                 xmm_mask = unpack_32_1x128 (m);
4198                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4199
4200                 /* Unpacking */
4201                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4202
4203                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4204                                         &xmm_mask_lo, &xmm_mask_hi);
4205
4206                 pix_multiply_2x128 (&xmm_src, &xmm_src,
4207                                     &xmm_mask_lo, &xmm_mask_hi,
4208                                     &xmm_mask_lo, &xmm_mask_hi);
4209
4210                 save_128_aligned (
4211                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4212             }
4213             else
4214             {
4215                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4216             }
4217
4218             w -= 4;
4219             dst += 4;
4220             mask += 4;
4221         }
4222
4223         while (w)
4224         {
4225             uint8_t m = *mask++;
4226
4227             if (m)
4228             {
4229                 *dst = pack_1x64_32 (
4230                     pix_multiply_1x64 (
4231                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4232             }
4233             else
4234             {
4235                 *dst = 0;
4236             }
4237
4238             w--;
4239             dst++;
4240         }
4241     }
4242
4243     _mm_empty ();
4244 }
4245
4246 /*-----------------------------------------------------------------------
4247  * composite_over_n_8_0565
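 *
 * OVER of a solid source onto an r5g6b5 destination, through an a8 mask.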
4248  */
4249
4250 static void
4251 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4252                               pixman_op_t              op,
4253                               pixman_image_t *         src_image,
4254                               pixman_image_t *         mask_image,
4255                               pixman_image_t *         dst_image,
4256                               int32_t                  src_x,
4257                               int32_t                  src_y,
4258                               int32_t                  mask_x,
4259                               int32_t                  mask_y,
4260                               int32_t                  dest_x,
4261                               int32_t                  dest_y,
4262                               int32_t                  width,
4263                               int32_t                  height)
4264 {
4265     uint32_t src, srca;
4266     uint16_t    *dst_line, *dst, d;
4267     uint8_t     *mask_line, *mask;
4268     int dst_stride, mask_stride;
4269     int32_t w;
4270     uint32_t m;
4271     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4272
4273     __m128i xmm_src, xmm_alpha;
4274     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4275     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4276
4277     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4278
4279     srca = src >> 24;
4280     if (src == 0)
4281         return;
4282
4283     PIXMAN_IMAGE_GET_LINE (
4284         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4285     PIXMAN_IMAGE_GET_LINE (
4286         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4287
4288     xmm_src = expand_pixel_32_1x128 (src);
4289     xmm_alpha = expand_alpha_1x128 (xmm_src);
4290     mmx_src = _mm_movepi64_pi64 (xmm_src);
4291     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4292
4293     while (height--)
4294     {
4295         dst = dst_line;
4296         dst_line += dst_stride;
4297         mask = mask_line;
4298         mask_line += mask_stride;
4299         w = width;
4300
4301         /* call prefetch hint to optimize cache load */
4302         cache_prefetch ((__m128i*)mask);
4303         cache_prefetch ((__m128i*)dst);
4304
4305         while (w && (unsigned long)dst & 15)
4306         {
4307             m = *mask++;
4308
4309             if (m)
4310             {
4311                 d = *dst;
4312                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4313                 mmx_dest = expand565_16_1x64 (d);
4314
4315                 *dst = pack_565_32_16 (
4316                     pack_1x64_32 (
4317                         in_over_1x64 (
4318                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4319             }
4320
4321             w--;
4322             dst++;
4323         }
4324
4325         /* call prefetch hint to optimize cache load */
4326         cache_prefetch ((__m128i*)mask);
4327         cache_prefetch ((__m128i*)dst);
4328
4329         while (w >= 8)
4330         {
4331             /* fill cache line with the next chunk of memory */
4332             cache_prefetch_next ((__m128i*)mask);
4333             cache_prefetch_next ((__m128i*)dst);
4334
4335             xmm_dst = load_128_aligned ((__m128i*) dst);
4336             unpack_565_128_4x128 (xmm_dst,
4337                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4338
4339             m = *((uint32_t*)mask);
4340             mask += 4;
4341
4342             if (m)
4343             {
4344                 xmm_mask = unpack_32_1x128 (m);
4345                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4346
4347                 /* Unpacking */
4348                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4349
4350                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4351                                         &xmm_mask_lo, &xmm_mask_hi);
4352
4353                 in_over_2x128 (&xmm_src, &xmm_src,
4354                                &xmm_alpha, &xmm_alpha,
4355                                &xmm_mask_lo, &xmm_mask_hi,
4356                                &xmm_dst0, &xmm_dst1);
4357             }
4358
4359             m = *((uint32_t*)mask);
4360             mask += 4;
4361
4362             if (m)
4363             {
4364                 xmm_mask = unpack_32_1x128 (m);
4365                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4366
4367                 /* Unpacking */
4368                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4369
4370                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4371                                         &xmm_mask_lo, &xmm_mask_hi);
4372                 in_over_2x128 (&xmm_src, &xmm_src,
4373                                &xmm_alpha, &xmm_alpha,
4374                                &xmm_mask_lo, &xmm_mask_hi,
4375                                &xmm_dst2, &xmm_dst3);
4376             }
4377
4378             save_128_aligned (
4379                 (__m128i*)dst, pack_565_4x128_128 (
4380                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4381
4382             w -= 8;
4383             dst += 8;
4384         }
4385
4386         while (w)
4387         {
4388             m = *mask++;
4389
4390             if (m)
4391             {
4392                 d = *dst;
4393                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4394                 mmx_dest = expand565_16_1x64 (d);
4395
4396                 *dst = pack_565_32_16 (
4397                     pack_1x64_32 (
4398                         in_over_1x64 (
4399                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4400             }
4401
4402             w--;
4403             dst++;
4404         }
4405     }
4406
4407     _mm_empty ();
4408 }
4409
4410 /* -----------------------------------------------------------------------
4411  * composite_over_pixbuf_0565
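 *
 * OVER of a non-premultiplied source with reversed channel order (as
 * handled by over_rev_non_pre_*) onto an r5g6b5 destination; fully
 * opaque and fully zero source blocks take shortcut paths.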
4412  */
4413
4414 static void
4415 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4416                                  pixman_op_t              op,
4417                                  pixman_image_t *         src_image,
4418                                  pixman_image_t *         mask_image,
4419                                  pixman_image_t *         dst_image,
4420                                  int32_t                  src_x,
4421                                  int32_t                  src_y,
4422                                  int32_t                  mask_x,
4423                                  int32_t                  mask_y,
4424                                  int32_t                  dest_x,
4425                                  int32_t                  dest_y,
4426                                  int32_t                  width,
4427                                  int32_t                  height)
4428 {
4429     uint16_t    *dst_line, *dst, d;
4430     uint32_t    *src_line, *src, s;
4431     int dst_stride, src_stride;
4432     int32_t w;
4433     uint32_t opaque, zero;
4434
4435     __m64 ms;
4436     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4437     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4438
4439     PIXMAN_IMAGE_GET_LINE (
4440         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4441     PIXMAN_IMAGE_GET_LINE (
4442         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4443
4444 #if 0
4445     /* FIXME
4446      *
4447      * This code is copied from the MMX version, along with its FIXME.
4448      * If it's a problem there, it's probably a problem here too.
4449      */
4450     assert (src_image->drawable == mask_image->drawable);
4451 #endif
4452
4453     while (height--)
4454     {
4455         dst = dst_line;
4456         dst_line += dst_stride;
4457         src = src_line;
4458         src_line += src_stride;
4459         w = width;
4460
4461         /* call prefetch hint to optimize cache load */
4462         cache_prefetch ((__m128i*)src);
4463         cache_prefetch ((__m128i*)dst);
4464
4465         while (w && (unsigned long)dst & 15)
4466         {
4467             s = *src++;
4468             d = *dst;
4469
4470             ms = unpack_32_1x64 (s);
4471
4472             *dst++ = pack_565_32_16 (
4473                 pack_1x64_32 (
4474                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4475             w--;
4476         }
4477
4478         /* call prefetch hint to optimize cache load */
4479         cache_prefetch ((__m128i*)src);
4480         cache_prefetch ((__m128i*)dst);
4481
4482         while (w >= 8)
4483         {
4484             /* fill cache line with the next chunk of memory */
4485             cache_prefetch_next ((__m128i*)src);
4486             cache_prefetch_next ((__m128i*)dst);
4487
4488             /* First round */
4489             xmm_src = load_128_unaligned ((__m128i*)src);
4490             xmm_dst = load_128_aligned  ((__m128i*)dst);
4491
4492             opaque = is_opaque (xmm_src);
4493             zero = is_zero (xmm_src);
4494
4495             unpack_565_128_4x128 (xmm_dst,
4496                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4497             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4498
4499             /* preload next round */
4500             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4501
4502             if (opaque)
4503             {
4504                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4505                                      &xmm_dst0, &xmm_dst1);
4506             }
4507             else if (!zero)
4508             {
4509                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4510                                         &xmm_dst0, &xmm_dst1);
4511             }
4512
4513             /* Second round */
4514             opaque = is_opaque (xmm_src);
4515             zero = is_zero (xmm_src);
4516
4517             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4518
4519             if (opaque)
4520             {
4521                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4522                                      &xmm_dst2, &xmm_dst3);
4523             }
4524             else if (!zero)
4525             {
4526                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4527                                         &xmm_dst2, &xmm_dst3);
4528             }
4529
4530             save_128_aligned (
4531                 (__m128i*)dst, pack_565_4x128_128 (
4532                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4533
4534             w -= 8;
4535             src += 8;
4536             dst += 8;
4537         }
4538
4539         while (w)
4540         {
4541             s = *src++;
4542             d = *dst;
4543
4544             ms = unpack_32_1x64 (s);
4545
4546             *dst++ = pack_565_32_16 (
4547                 pack_1x64_32 (
4548                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4549             w--;
4550         }
4551     }
4552
4553     _mm_empty ();
4554 }
4555
4556 /* -------------------------------------------------------------------------
4557  * composite_over_pixbuf_8888
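 *
 * Same as the pixbuf_0565 case, but with an a8r8g8b8 destination.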
4558  */
4559
4560 static void
4561 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4562                                  pixman_op_t              op,
4563                                  pixman_image_t *         src_image,
4564                                  pixman_image_t *         mask_image,
4565                                  pixman_image_t *         dst_image,
4566                                  int32_t                  src_x,
4567                                  int32_t                  src_y,
4568                                  int32_t                  mask_x,
4569                                  int32_t                  mask_y,
4570                                  int32_t                  dest_x,
4571                                  int32_t                  dest_y,
4572                                  int32_t                  width,
4573                                  int32_t                  height)
4574 {
4575     uint32_t    *dst_line, *dst, d;
4576     uint32_t    *src_line, *src, s;
4577     int dst_stride, src_stride;
4578     int32_t w;
4579     uint32_t opaque, zero;
4580
4581     __m128i xmm_src_lo, xmm_src_hi;
4582     __m128i xmm_dst_lo, xmm_dst_hi;
4583
4584     PIXMAN_IMAGE_GET_LINE (
4585         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4586     PIXMAN_IMAGE_GET_LINE (
4587         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4588
4589 #if 0
4590     /* FIXME
4591      *
4592      * This code is copied from the MMX version, along with its FIXME.
4593      * If it's a problem there, it's probably a problem here too.
4594      */
4595     assert (src_image->drawable == mask_image->drawable);
4596 #endif
4597
4598     while (height--)
4599     {
4600         dst = dst_line;
4601         dst_line += dst_stride;
4602         src = src_line;
4603         src_line += src_stride;
4604         w = width;
4605
4606         /* call prefetch hint to optimize cache load */
4607         cache_prefetch ((__m128i*)src);
4608         cache_prefetch ((__m128i*)dst);
4609
4610         while (w && (unsigned long)dst & 15)
4611         {
4612             s = *src++;
4613             d = *dst;
4614
4615             *dst++ = pack_1x64_32 (
4616                 over_rev_non_pre_1x64 (
4617                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4618
4619             w--;
4620         }
4621
4622         /* call prefetch hint to optimize cache load */
4623         cache_prefetch ((__m128i*)src);
4624         cache_prefetch ((__m128i*)dst);
4625
4626         while (w >= 4)
4627         {
4628             /* fill cache line with the next chunk of memory */
4629             cache_prefetch_next ((__m128i*)src);
4630             cache_prefetch_next ((__m128i*)dst);
4631
4632             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4633
4634             opaque = is_opaque (xmm_src_hi);
4635             zero = is_zero (xmm_src_hi);
4636
4637             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4638
4639             if (opaque)
4640             {
4641                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4642                                      &xmm_dst_lo, &xmm_dst_hi);
4643
4644                 save_128_aligned (
4645                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4646             }
4647             else if (!zero)
4648             {
4649                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4650
4651                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4652
4653                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4654                                         &xmm_dst_lo, &xmm_dst_hi);
4655
4656                 save_128_aligned (
4657                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4658             }
4659
4660             w -= 4;
4661             dst += 4;
4662             src += 4;
4663         }
4664
4665         while (w)
4666         {
4667             s = *src++;
4668             d = *dst;
4669
4670             *dst++ = pack_1x64_32 (
4671                 over_rev_non_pre_1x64 (
4672                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4673
4674             w--;
4675         }
4676     }
4677
4678     _mm_empty ();
4679 }
4680
4681 /* -------------------------------------------------------------------------
4682  * composite_over_n_8888_0565_ca
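 *
 * OVER of a solid source onto an r5g6b5 destination with a per-component
 * (component-alpha) a8r8g8b8 mask.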
4683  */
4684
4685 static void
4686 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4687                                     pixman_op_t              op,
4688                                     pixman_image_t *         src_image,
4689                                     pixman_image_t *         mask_image,
4690                                     pixman_image_t *         dst_image,
4691                                     int32_t                  src_x,
4692                                     int32_t                  src_y,
4693                                     int32_t                  mask_x,
4694                                     int32_t                  mask_y,
4695                                     int32_t                  dest_x,
4696                                     int32_t                  dest_y,
4697                                     int32_t                  width,
4698                                     int32_t                  height)
4699 {
4700     uint32_t src;
4701     uint16_t    *dst_line, *dst, d;
4702     uint32_t    *mask_line, *mask, m;
4703     int dst_stride, mask_stride;
4704     int w;
4705     uint32_t pack_cmp;
4706
4707     __m128i xmm_src, xmm_alpha;
4708     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4709     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4710
4711     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4712
4713     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4714
4715     if (src == 0)
4716         return;
4717
4718     PIXMAN_IMAGE_GET_LINE (
4719         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4720     PIXMAN_IMAGE_GET_LINE (
4721         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4722
4723     xmm_src = expand_pixel_32_1x128 (src);
4724     xmm_alpha = expand_alpha_1x128 (xmm_src);
4725     mmx_src = _mm_movepi64_pi64 (xmm_src);
4726     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4727
4728     while (height--)
4729     {
4730         w = width;
4731         mask = mask_line;
4732         dst = dst_line;
4733         mask_line += mask_stride;
4734         dst_line += dst_stride;
4735
4736         /* call prefetch hint to optimize cache load */
4737         cache_prefetch ((__m128i*)mask);
4738         cache_prefetch ((__m128i*)dst);
4739
4740         while (w && ((unsigned long)dst & 15))
4741         {
4742             m = *(uint32_t *) mask;
4743
4744             if (m)
4745             {
4746                 d = *dst;
4747                 mmx_mask = unpack_32_1x64 (m);
4748                 mmx_dest = expand565_16_1x64 (d);
4749
4750                 *dst = pack_565_32_16 (
4751                     pack_1x64_32 (
4752                         in_over_1x64 (
4753                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4754             }
4755
4756             w--;
4757             dst++;
4758             mask++;
4759         }
4760
4761         /* call prefetch hint to optimize cache load */
4762         cache_prefetch ((__m128i*)mask);
4763         cache_prefetch ((__m128i*)dst);
4764
4765         while (w >= 8)
4766         {
4767             /* fill cache line with the next chunk of memory */
4768             cache_prefetch_next ((__m128i*)mask);
4769             cache_prefetch_next ((__m128i*)dst);
4770
4771             /* First round */
4772             xmm_mask = load_128_unaligned ((__m128i*)mask);
4773             xmm_dst = load_128_aligned ((__m128i*)dst);
4774
4775             pack_cmp = _mm_movemask_epi8 (
4776                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4777
4778             unpack_565_128_4x128 (xmm_dst,
4779                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4780             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4781
4782             /* preload next round */
4783             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4784
4786             if (pack_cmp != 0xffff)
4787             {
4788                 in_over_2x128 (&xmm_src, &xmm_src,
4789                                &xmm_alpha, &xmm_alpha,
4790                                &xmm_mask_lo, &xmm_mask_hi,
4791                                &xmm_dst0, &xmm_dst1);
4792             }
4793
4794             /* Second round */
4795             pack_cmp = _mm_movemask_epi8 (
4796                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4797
4798             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4799
4800             if (pack_cmp != 0xffff)
4801             {
4802                 in_over_2x128 (&xmm_src, &xmm_src,
4803                                &xmm_alpha, &xmm_alpha,
4804                                &xmm_mask_lo, &xmm_mask_hi,
4805                                &xmm_dst2, &xmm_dst3);
4806             }
4807
4808             save_128_aligned (
4809                 (__m128i*)dst, pack_565_4x128_128 (
4810                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4811
4812             w -= 8;
4813             dst += 8;
4814             mask += 8;
4815         }
4816
4817         while (w)
4818         {
4819             m = *(uint32_t *) mask;
4820
4821             if (m)
4822             {
4823                 d = *dst;
4824                 mmx_mask = unpack_32_1x64 (m);
4825                 mmx_dest = expand565_16_1x64 (d);
4826
4827                 *dst = pack_565_32_16 (
4828                     pack_1x64_32 (
4829                         in_over_1x64 (
4830                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4831             }
4832
4833             w--;
4834             dst++;
4835             mask++;
4836         }
4837     }
4838
4839     _mm_empty ();
4840 }
4841
4842 /* -----------------------------------------------------------------------
4843  * composite_in_n_8_8
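 *
 * IN of a solid source and an a8 mask into an a8 destination:
 * dst = src.alpha * mask * dst.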
4844  */
4845
4846 static void
4847 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4848                          pixman_op_t              op,
4849                          pixman_image_t *         src_image,
4850                          pixman_image_t *         mask_image,
4851                          pixman_image_t *         dst_image,
4852                          int32_t                  src_x,
4853                          int32_t                  src_y,
4854                          int32_t                  mask_x,
4855                          int32_t                  mask_y,
4856                          int32_t                  dest_x,
4857                          int32_t                  dest_y,
4858                          int32_t                  width,
4859                          int32_t                  height)
4860 {
4861     uint8_t     *dst_line, *dst;
4862     uint8_t     *mask_line, *mask;
4863     int dst_stride, mask_stride;
4864     uint32_t d, m;
4865     uint32_t src;
4866     uint8_t sa;
4867     int32_t w;
4868
4869     __m128i xmm_alpha;
4870     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4871     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4872
4873     PIXMAN_IMAGE_GET_LINE (
4874         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4875     PIXMAN_IMAGE_GET_LINE (
4876         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4877
4878     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4879
4880     sa = src >> 24;
4881
4882     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4883
4884     while (height--)
4885     {
4886         dst = dst_line;
4887         dst_line += dst_stride;
4888         mask = mask_line;
4889         mask_line += mask_stride;
4890         w = width;
4891
4892         /* call prefetch hint to optimize cache load */
4893         cache_prefetch ((__m128i*)mask);
4894         cache_prefetch ((__m128i*)dst);
4895
4896         while (w && ((unsigned long)dst & 15))
4897         {
4898             m = (uint32_t) *mask++;
4899             d = (uint32_t) *dst;
4900
4901             *dst++ = (uint8_t) pack_1x64_32 (
4902                 pix_multiply_1x64 (
4903                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4904                                        unpack_32_1x64 (m)),
4905                     unpack_32_1x64 (d)));
4906             w--;
4907         }
4908
4909         /* call prefetch hint to optimize cache load */
4910         cache_prefetch ((__m128i*)mask);
4911         cache_prefetch ((__m128i*)dst);
4912
4913         while (w >= 16)
4914         {
4915             /* fill cache line with the next chunk of memory */
4916             cache_prefetch_next ((__m128i*)mask);
4917             cache_prefetch_next ((__m128i*)dst);
4918
4919             xmm_mask = load_128_unaligned ((__m128i*)mask);
4920             xmm_dst = load_128_aligned ((__m128i*)dst);
4921
4922             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4923             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4924
4925             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4926                                 &xmm_mask_lo, &xmm_mask_hi,
4927                                 &xmm_mask_lo, &xmm_mask_hi);
4928
4929             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4930                                 &xmm_dst_lo, &xmm_dst_hi,
4931                                 &xmm_dst_lo, &xmm_dst_hi);
4932
4933             save_128_aligned (
4934                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4935
4936             mask += 16;
4937             dst += 16;
4938             w -= 16;
4939         }
4940
4941         while (w)
4942         {
4943             m = (uint32_t) *mask++;
4944             d = (uint32_t) *dst;
4945
4946             *dst++ = (uint8_t) pack_1x64_32 (
4947                 pix_multiply_1x64 (
4948                     pix_multiply_1x64 (
4949                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4950                     unpack_32_1x64 (d)));
4951             w--;
4952         }
4953     }
4954
4955     _mm_empty ();
4956 }
4957
4958 /* ---------------------------------------------------------------------------
4959  * composite_in_8_8
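 *
 * IN of an a8 source into an a8 destination: dst = src * dst.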
4960  */
4961
4962 static void
4963 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4964                        pixman_op_t              op,
4965                        pixman_image_t *         src_image,
4966                        pixman_image_t *         mask_image,
4967                        pixman_image_t *         dst_image,
4968                        int32_t                  src_x,
4969                        int32_t                  src_y,
4970                        int32_t                  mask_x,
4971                        int32_t                  mask_y,
4972                        int32_t                  dest_x,
4973                        int32_t                  dest_y,
4974                        int32_t                  width,
4975                        int32_t                  height)
4976 {
4977     uint8_t     *dst_line, *dst;
4978     uint8_t     *src_line, *src;
4979     int src_stride, dst_stride;
4980     int32_t w;
4981     uint32_t s, d;
4982
4983     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4984     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4985
4986     PIXMAN_IMAGE_GET_LINE (
4987         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4988     PIXMAN_IMAGE_GET_LINE (
4989         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4990
4991     while (height--)
4992     {
4993         dst = dst_line;
4994         dst_line += dst_stride;
4995         src = src_line;
4996         src_line += src_stride;
4997         w = width;
4998
4999         /* call prefetch hint to optimize cache load */
5000         cache_prefetch ((__m128i*)src);
5001         cache_prefetch ((__m128i*)dst);
5002
5003         while (w && ((unsigned long)dst & 15))
5004         {
5005             s = (uint32_t) *src++;
5006             d = (uint32_t) *dst;
5007
5008             *dst++ = (uint8_t) pack_1x64_32 (
5009                 pix_multiply_1x64 (
5010                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
5011             w--;
5012         }
5013
5014         /* call prefetch hint to optimize cache load */
5015         cache_prefetch ((__m128i*)src);
5016         cache_prefetch ((__m128i*)dst);
5017
5018         while (w >= 16)
5019         {
5020             /* prefetch the next cache line */
5021             cache_prefetch_next ((__m128i*)src);
5022             cache_prefetch_next ((__m128i*)dst);
5023
5024             xmm_src = load_128_unaligned ((__m128i*)src);
5025             xmm_dst = load_128_aligned ((__m128i*)dst);
5026
5027             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5028             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5029
5030             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5031                                 &xmm_dst_lo, &xmm_dst_hi,
5032                                 &xmm_dst_lo, &xmm_dst_hi);
5033
5034             save_128_aligned (
5035                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5036
5037             src += 16;
5038             dst += 16;
5039             w -= 16;
5040         }
5041
5042         while (w)
5043         {
5044             s = (uint32_t) *src++;
5045             d = (uint32_t) *dst;
5046
5047             *dst++ = (uint8_t) pack_1x64_32 (
5048                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5049             w--;
5050         }
5051     }
5052
5053     _mm_empty ();
5054 }
5055
5056 /* -------------------------------------------------------------------------
5057  * composite_add_n_8_8
5058  */
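/*
 * PIXMAN_OP_ADD with a solid source, an a8 mask and an a8 destination:
 * the expanded source alpha is multiplied by each mask byte and the
 * result is added to the destination with unsigned saturation
 * (_mm_adds_epu16 in the vector loop, _mm_adds_pu16 for the scalar
 * head and tail pixels).
 */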
5059
5060 static void
5061 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5062                           pixman_op_t              op,
5063                           pixman_image_t *         src_image,
5064                           pixman_image_t *         mask_image,
5065                           pixman_image_t *         dst_image,
5066                           int32_t                  src_x,
5067                           int32_t                  src_y,
5068                           int32_t                  mask_x,
5069                           int32_t                  mask_y,
5070                           int32_t                  dest_x,
5071                           int32_t                  dest_y,
5072                           int32_t                  width,
5073                           int32_t                  height)
5074 {
5075     uint8_t     *dst_line, *dst;
5076     uint8_t     *mask_line, *mask;
5077     int dst_stride, mask_stride;
5078     int32_t w;
5079     uint32_t src;
5080     uint8_t sa;
5081     uint32_t m, d;
5082
5083     __m128i xmm_alpha;
5084     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5085     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5086
5087     PIXMAN_IMAGE_GET_LINE (
5088         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5089     PIXMAN_IMAGE_GET_LINE (
5090         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5091
5092     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5093
5094     sa = src >> 24;
5095
5096     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5097
5098     while (height--)
5099     {
5100         dst = dst_line;
5101         dst_line += dst_stride;
5102         mask = mask_line;
5103         mask_line += mask_stride;
5104         w = width;
5105
5106         /* call prefetch hint to optimize cache load */
5107         cache_prefetch ((__m128i*)mask);
5108         cache_prefetch ((__m128i*)dst);
5109
5110         while (w && ((unsigned long)dst & 15))
5111         {
5112             m = (uint32_t) *mask++;
5113             d = (uint32_t) *dst;
5114
5115             *dst++ = (uint8_t) pack_1x64_32 (
5116                 _mm_adds_pu16 (
5117                     pix_multiply_1x64 (
5118                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5119                     unpack_32_1x64 (d)));
5120             w--;
5121         }
5122
5123         /* call prefetch hint to optimize cache load */
5124         cache_prefetch ((__m128i*)mask);
5125         cache_prefetch ((__m128i*)dst);
5126
5127         while (w >= 16)
5128         {
5129             /* prefetch the next cache line */
5130             cache_prefetch_next ((__m128i*)mask);
5131             cache_prefetch_next ((__m128i*)dst);
5132
5133             xmm_mask = load_128_unaligned ((__m128i*)mask);
5134             xmm_dst = load_128_aligned ((__m128i*)dst);
5135
5136             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5137             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5138
5139             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5140                                 &xmm_mask_lo, &xmm_mask_hi,
5141                                 &xmm_mask_lo, &xmm_mask_hi);
5142
5143             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5144             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5145
5146             save_128_aligned (
5147                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5148
5149             mask += 16;
5150             dst += 16;
5151             w -= 16;
5152         }
5153
5154         while (w)
5155         {
5156             m = (uint32_t) *mask++;
5157             d = (uint32_t) *dst;
5158
5159             *dst++ = (uint8_t) pack_1x64_32 (
5160                 _mm_adds_pu16 (
5161                     pix_multiply_1x64 (
5162                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5163                     unpack_32_1x64 (d)));
5164
5165             w--;
5166         }
5167     }
5168
5169     _mm_empty ();
5170 }
5171
5172 /* ----------------------------------------------------------------------
5173  * composite_add_8000_8000
5174  */
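/*
 * PIXMAN_OP_ADD for a8 + a8: a byte-wise saturating add.  Head and tail
 * pixels are handled with scalar arithmetic; the aligned middle of each
 * scanline is passed to core_combine_add_u_sse2 () four bytes at a time.
 */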
5175
5176 static void
5177 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5178                               pixman_op_t              op,
5179                               pixman_image_t *         src_image,
5180                               pixman_image_t *         mask_image,
5181                               pixman_image_t *         dst_image,
5182                               int32_t                  src_x,
5183                               int32_t                  src_y,
5184                               int32_t                  mask_x,
5185                               int32_t                  mask_y,
5186                               int32_t                  dest_x,
5187                               int32_t                  dest_y,
5188                               int32_t                  width,
5189                               int32_t                  height)
5190 {
5191     uint8_t     *dst_line, *dst;
5192     uint8_t     *src_line, *src;
5193     int dst_stride, src_stride;
5194     int32_t w;
5195     uint16_t t;
5196
5197     PIXMAN_IMAGE_GET_LINE (
5198         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5199     PIXMAN_IMAGE_GET_LINE (
5200         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5201
5202     while (height--)
5203     {
5204         dst = dst_line;
5205         src = src_line;
5206
5207         /* call prefetch hint to optimize cache load */
5208         cache_prefetch ((__m128i*)src);
5209         cache_prefetch ((__m128i*)dst);
5210
5211         dst_line += dst_stride;
5212         src_line += src_stride;
5213         w = width;
5214
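        /* t | (0 - (t >> 8)) clamps the 16-bit sum to 255: when the sum
         * overflows a byte, t >> 8 is non-zero and the subtraction yields
         * a value whose low byte is 0xff.  Used by the head and tail
         * loops below. */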
5215         /* Small head */
5216         while (w && (unsigned long)dst & 3)
5217         {
5218             t = (*dst) + (*src++);
5219             *dst++ = t | (0 - (t >> 8));
5220             w--;
5221         }
5222
5223         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5224
5225         /* Small tail */
5226         dst += w & 0xfffc;
5227         src += w & 0xfffc;
5228
5229         w &= 3;
5230
5231         while (w)
5232         {
5233             t = (*dst) + (*src++);
5234             *dst++ = t | (0 - (t >> 8));
5235             w--;
5236         }
5237     }
5238
5239     _mm_empty ();
5240 }
5241
5242 /* ---------------------------------------------------------------------
5243  * composite_add_8888_8888
5244  */
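/*
 * PIXMAN_OP_ADD for a8r8g8b8 + a8r8g8b8: each scanline is handed to
 * core_combine_add_u_sse2 (), which does a component-wise saturating add
 * of source onto destination.
 */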
5245 static void
5246 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5247                               pixman_op_t              op,
5248                               pixman_image_t *         src_image,
5249                               pixman_image_t *         mask_image,
5250                               pixman_image_t *         dst_image,
5251                               int32_t                  src_x,
5252                               int32_t                  src_y,
5253                               int32_t                  mask_x,
5254                               int32_t                  mask_y,
5255                               int32_t                  dest_x,
5256                               int32_t                  dest_y,
5257                               int32_t                  width,
5258                               int32_t                  height)
5259 {
5260     uint32_t    *dst_line, *dst;
5261     uint32_t    *src_line, *src;
5262     int dst_stride, src_stride;
5263
5264     PIXMAN_IMAGE_GET_LINE (
5265         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5266     PIXMAN_IMAGE_GET_LINE (
5267         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5268
5269     while (height--)
5270     {
5271         dst = dst_line;
5272         dst_line += dst_stride;
5273         src = src_line;
5274         src_line += src_stride;
5275
5276         core_combine_add_u_sse2 (dst, src, NULL, width);
5277     }
5278
5279     _mm_empty ();
5280 }
5281
5282 /* -------------------------------------------------------------------------------------------------
5283  * sse2_composite_copy_area
5284  */
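/*
 * pixman_blt_sse2 () is a straight copy for images of equal depth (16 or
 * 32 bpp only; anything else returns FALSE).  Strides are converted from
 * uint32_t units to bytes, the destination is brought to 16-byte
 * alignment with 2- and 4-byte copies, the bulk is moved 64 bytes per
 * iteration with unaligned loads and aligned stores, and smaller copies
 * finish the remainder of each row.
 */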
5285
5286 static pixman_bool_t
5287 pixman_blt_sse2 (uint32_t *src_bits,
5288                  uint32_t *dst_bits,
5289                  int       src_stride,
5290                  int       dst_stride,
5291                  int       src_bpp,
5292                  int       dst_bpp,
5293                  int       src_x,
5294                  int       src_y,
5295                  int       dst_x,
5296                  int       dst_y,
5297                  int       width,
5298                  int       height)
5299 {
5300     uint8_t *   src_bytes;
5301     uint8_t *   dst_bytes;
5302     int byte_width;
5303
5304     if (src_bpp != dst_bpp)
5305         return FALSE;
5306
5307     if (src_bpp == 16)
5308     {
5309         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5310         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5311         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5312         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5313         byte_width = 2 * width;
5314         src_stride *= 2;
5315         dst_stride *= 2;
5316     }
5317     else if (src_bpp == 32)
5318     {
5319         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5320         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5321         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5322         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5323         byte_width = 4 * width;
5324         src_stride *= 4;
5325         dst_stride *= 4;
5326     }
5327     else
5328     {
5329         return FALSE;
5330     }
5331
5332     cache_prefetch ((__m128i*)src_bytes);
5333     cache_prefetch ((__m128i*)dst_bytes);
5334
5335     while (height--)
5336     {
5337         int w;
5338         uint8_t *s = src_bytes;
5339         uint8_t *d = dst_bytes;
5340         src_bytes += src_stride;
5341         dst_bytes += dst_stride;
5342         w = byte_width;
5343
5344         cache_prefetch_next ((__m128i*)s);
5345         cache_prefetch_next ((__m128i*)d);
5346
5347         while (w >= 2 && ((unsigned long)d & 3))
5348         {
5349             *(uint16_t *)d = *(uint16_t *)s;
5350             w -= 2;
5351             s += 2;
5352             d += 2;
5353         }
5354
5355         while (w >= 4 && ((unsigned long)d & 15))
5356         {
5357             *(uint32_t *)d = *(uint32_t *)s;
5358
5359             w -= 4;
5360             s += 4;
5361             d += 4;
5362         }
5363
5364         cache_prefetch_next ((__m128i*)s);
5365         cache_prefetch_next ((__m128i*)d);
5366
5367         while (w >= 64)
5368         {
5369             __m128i xmm0, xmm1, xmm2, xmm3;
5370
5371             /* 128 bytes ahead */
5372             cache_prefetch (((__m128i*)s) + 8);
5373             cache_prefetch (((__m128i*)d) + 8);
5374
5375             xmm0 = load_128_unaligned ((__m128i*)(s));
5376             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5377             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5378             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5379
5380             save_128_aligned ((__m128i*)(d),    xmm0);
5381             save_128_aligned ((__m128i*)(d + 16), xmm1);
5382             save_128_aligned ((__m128i*)(d + 32), xmm2);
5383             save_128_aligned ((__m128i*)(d + 48), xmm3);
5384
5385             s += 64;
5386             d += 64;
5387             w -= 64;
5388         }
5389
5390         cache_prefetch_next ((__m128i*)s);
5391         cache_prefetch_next ((__m128i*)d);
5392
5393         while (w >= 16)
5394         {
5395             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5396
5397             w -= 16;
5398             d += 16;
5399             s += 16;
5400         }
5401
5402         cache_prefetch_next ((__m128i*)s);
5403         cache_prefetch_next ((__m128i*)d);
5404
5405         while (w >= 4)
5406         {
5407             *(uint32_t *)d = *(uint32_t *)s;
5408
5409             w -= 4;
5410             s += 4;
5411             d += 4;
5412         }
5413
5414         if (w >= 2)
5415         {
5416             *(uint16_t *)d = *(uint16_t *)s;
5417             w -= 2;
5418             s += 2;
5419             d += 2;
5420         }
5421     }
5422
5423     _mm_empty ();
5424
5425     return TRUE;
5426 }
5427
5428 static void
5429 sse2_composite_copy_area (pixman_implementation_t *imp,
5430                           pixman_op_t              op,
5431                           pixman_image_t *         src_image,
5432                           pixman_image_t *         mask_image,
5433                           pixman_image_t *         dst_image,
5434                           int32_t                  src_x,
5435                           int32_t                  src_y,
5436                           int32_t                  mask_x,
5437                           int32_t                  mask_y,
5438                           int32_t                  dest_x,
5439                           int32_t                  dest_y,
5440                           int32_t                  width,
5441                           int32_t                  height)
5442 {
5443     pixman_blt_sse2 (src_image->bits.bits,
5444                      dst_image->bits.bits,
5445                      src_image->bits.rowstride,
5446                      dst_image->bits.rowstride,
5447                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5448                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5449                      src_x, src_y, dest_x, dest_y, width, height);
5450 }
5451
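/* ---------------------------------------------------------------------
 * composite_over_x888_8_8888
 *
 * PIXMAN_OP_OVER with an x8r8g8b8 source, an a8 mask and an 8888
 * destination.  The unused source alpha byte is forced to 0xff, so a
 * fully opaque mask stores the source directly; otherwise the pixel is
 * computed with in_over.
 */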
5452 static void
5453 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5454                                  pixman_op_t              op,
5455                                  pixman_image_t *         src_image,
5456                                  pixman_image_t *         mask_image,
5457                                  pixman_image_t *         dst_image,
5458                                  int32_t                  src_x,
5459                                  int32_t                  src_y,
5460                                  int32_t                  mask_x,
5461                                  int32_t                  mask_y,
5462                                  int32_t                  dest_x,
5463                                  int32_t                  dest_y,
5464                                  int32_t                  width,
5465                                  int32_t                  height)
5466 {
5467     uint32_t    *src, *src_line, s;
5468     uint32_t    *dst, *dst_line, d;
5469     uint8_t         *mask, *mask_line;
5470     uint32_t m;
5471     int src_stride, mask_stride, dst_stride;
5472     int32_t w;
5473     __m64 ms;
5474
5475     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5476     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5477     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5478
5479     PIXMAN_IMAGE_GET_LINE (
5480         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5481     PIXMAN_IMAGE_GET_LINE (
5482         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5483     PIXMAN_IMAGE_GET_LINE (
5484         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5485
5486     while (height--)
5487     {
5488         src = src_line;
5489         src_line += src_stride;
5490         dst = dst_line;
5491         dst_line += dst_stride;
5492         mask = mask_line;
5493         mask_line += mask_stride;
5494
5495         w = width;
5496
5497         /* call prefetch hint to optimize cache load */
5498         cache_prefetch ((__m128i*)src);
5499         cache_prefetch ((__m128i*)dst);
5500         cache_prefetch ((__m128i*)mask);
5501
5502         while (w && (unsigned long)dst & 15)
5503         {
5504             s = 0xff000000 | *src++;
5505             m = (uint32_t) *mask++;
5506             d = *dst;
5507             ms = unpack_32_1x64 (s);
5508
5509             if (m != 0xff)
5510             {
5511                 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5512                 __m64 md = unpack_32_1x64 (d);
5513
5514                 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5515             }
5516
5517             *dst++ = pack_1x64_32 (ms);
5518             w--;
5519         }
5520
5521         /* call prefetch hint to optimize cache load */
5522         cache_prefetch ((__m128i*)src);
5523         cache_prefetch ((__m128i*)dst);
5524         cache_prefetch ((__m128i*)mask);
5525
5526         while (w >= 4)
5527         {
5528             /* prefetch the next cache line */
5529             cache_prefetch_next ((__m128i*)src);
5530             cache_prefetch_next ((__m128i*)dst);
5531             cache_prefetch_next ((__m128i*)mask);
5532
5533             m = *(uint32_t*) mask;
5534             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5535
5536             if (m == 0xffffffff)
5537             {
5538                 save_128_aligned ((__m128i*)dst, xmm_src);
5539             }
5540             else
5541             {
5542                 xmm_dst = load_128_aligned ((__m128i*)dst);
5543
5544                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5545
5546                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5547                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5548                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5549
5550                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5551
5552                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5553
5554                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5555             }
5556
5557             src += 4;
5558             dst += 4;
5559             mask += 4;
5560             w -= 4;
5561         }
5562
5563         while (w)
5564         {
5565             m = (uint32_t) *mask++;
5566
5567             if (m)
5568             {
5569                 s = 0xff000000 | *src;
5570
5571                 if (m == 0xff)
5572                 {
5573                     *dst = s;
5574                 }
5575                 else
5576                 {
5577                     __m64 ma, md, ms;
5578
5579                     d = *dst;
5580
5581                     ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5582                     md = unpack_32_1x64 (d);
5583                     ms = unpack_32_1x64 (s);
5584
5585                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5586                 }
5587
5588             }
5589
5590             src++;
5591             dst++;
5592             w--;
5593         }
5594     }
5595
5596     _mm_empty ();
5597 }
5598
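/* ---------------------------------------------------------------------
 * composite_over_8888_8_8888
 *
 * PIXMAN_OP_OVER with an a8r8g8b8 source, an a8 mask and an 8888
 * destination.  Opaque source pixels under a 0xff mask are stored
 * directly, zero masks leave the destination untouched, and everything
 * else goes through in_over with the source alpha.
 */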
5599 static void
5600 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5601                                  pixman_op_t              op,
5602                                  pixman_image_t *         src_image,
5603                                  pixman_image_t *         mask_image,
5604                                  pixman_image_t *         dst_image,
5605                                  int32_t                  src_x,
5606                                  int32_t                  src_y,
5607                                  int32_t                  mask_x,
5608                                  int32_t                  mask_y,
5609                                  int32_t                  dest_x,
5610                                  int32_t                  dest_y,
5611                                  int32_t                  width,
5612                                  int32_t                  height)
5613 {
5614     uint32_t    *src, *src_line, s;
5615     uint32_t    *dst, *dst_line, d;
5616     uint8_t         *mask, *mask_line;
5617     uint32_t m;
5618     int src_stride, mask_stride, dst_stride;
5619     int32_t w;
5620
5621     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5622     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5623     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5624
5625     PIXMAN_IMAGE_GET_LINE (
5626         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5627     PIXMAN_IMAGE_GET_LINE (
5628         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5629     PIXMAN_IMAGE_GET_LINE (
5630         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5631
5632     while (height--)
5633     {
5634         src = src_line;
5635         src_line += src_stride;
5636         dst = dst_line;
5637         dst_line += dst_stride;
5638         mask = mask_line;
5639         mask_line += mask_stride;
5640
5641         w = width;
5642
5643         /* call prefetch hint to optimize cache load */
5644         cache_prefetch ((__m128i *)src);
5645         cache_prefetch ((__m128i *)dst);
5646         cache_prefetch ((__m128i *)mask);
5647
5648         while (w && (unsigned long)dst & 15)
5649         {
5650             uint32_t sa;
5651
5652             s = *src++;
5653             m = (uint32_t) *mask++;
5654             d = *dst;
5655
5656             sa = s >> 24;
5657
5658             if (m)
5659             {
5660                 if (sa == 0xff && m == 0xff)
5661                 {
5662                     *dst = s;
5663                 }
5664                 else
5665                 {
5666                     __m64 ms, md, ma, msa;
5667
5668                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5669                     ms = unpack_32_1x64 (s);
5670                     md = unpack_32_1x64 (d);
5671
5672                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5673
5674                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5675                 }
5676             }
5677
5678             dst++;
5679             w--;
5680         }
5681
5682         /* call prefetch hint to optimize cache load */
5683         cache_prefetch ((__m128i *)src);
5684         cache_prefetch ((__m128i *)dst);
5685         cache_prefetch ((__m128i *)mask);
5686
5687         while (w >= 4)
5688         {
5689             /* prefetch the next cache line */
5690             cache_prefetch_next ((__m128i *)src);
5691             cache_prefetch_next ((__m128i *)dst);
5692             cache_prefetch_next ((__m128i *)mask);
5693
5694             m = *(uint32_t *) mask;
5695
5696             if (m)
5697             {
5698                 xmm_src = load_128_unaligned ((__m128i*)src);
5699
5700                 if (m == 0xffffffff && is_opaque (xmm_src))
5701                 {
5702                     save_128_aligned ((__m128i *)dst, xmm_src);
5703                 }
5704                 else
5705                 {
5706                     xmm_dst = load_128_aligned ((__m128i *)dst);
5707
5708                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5709
5710                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5711                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5712                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5713
5714                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5715                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5716
5717                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5718                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5719
5720                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5721                 }
5722             }
5723
5724             src += 4;
5725             dst += 4;
5726             mask += 4;
5727             w -= 4;
5728         }
5729
5730         while (w)
5731         {
5732             uint32_t sa;
5733
5734             s = *src++;
5735             m = (uint32_t) *mask++;
5736             d = *dst;
5737
5738             sa = s >> 24;
5739
5740             if (m)
5741             {
5742                 if (sa == 0xff && m == 0xff)
5743                 {
5744                     *dst = s;
5745                 }
5746                 else
5747                 {
5748                     __m64 ms, md, ma, msa;
5749
5750                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5751                     ms = unpack_32_1x64 (s);
5752                     md = unpack_32_1x64 (d);
5753
5754                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5755
5756                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5757                 }
5758             }
5759
5760             dst++;
5761             w--;
5762         }
5763     }
5764
5765     _mm_empty ();
5766 }
5767
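/*
 * Fast path table: each entry maps an (operator, source format, mask
 * format, destination format) combination to one of the SSE2 routines
 * above.  The list is terminated by PIXMAN_OP_NONE and consulted by the
 * general compositing code when picking an implementation.
 */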
5768 static const pixman_fast_path_t sse2_fast_paths[] =
5769 {
5770     /* PIXMAN_OP_OVER */
5771     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5772     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5773     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5774     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5775     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5776     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5777     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5778     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5779     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5780     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5781     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5782     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5783     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5784     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5785     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5786     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5787     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5788     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5789     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5790     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5791     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5792     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5793     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5794     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5795     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5796     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5797     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5798     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5799     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5800     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5801     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5802     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5803     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5804     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5805     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5806     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5807     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5808     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5809     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5810     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5811     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5812     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5813     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5814     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5815     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5816
5817     /* PIXMAN_OP_ADD */
5818     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5819     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
5820     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5821     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5822     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5823
5824     /* PIXMAN_OP_SRC */
5825     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5826     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5827     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5828     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5829     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5830     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5831     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5832     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5833     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5834     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5835     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5836     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5837
5838     /* PIXMAN_OP_IN */
5839     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5840     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5841
5842     { PIXMAN_OP_NONE },
5843 };
5844
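/*
 * Try the SSE2 blitter; when the bpp combination is not supported it
 * falls back to the delegate implementation.
 */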
5845 static pixman_bool_t
5846 sse2_blt (pixman_implementation_t *imp,
5847           uint32_t *               src_bits,
5848           uint32_t *               dst_bits,
5849           int                      src_stride,
5850           int                      dst_stride,
5851           int                      src_bpp,
5852           int                      dst_bpp,
5853           int                      src_x,
5854           int                      src_y,
5855           int                      dst_x,
5856           int                      dst_y,
5857           int                      width,
5858           int                      height)
5859 {
5860     if (!pixman_blt_sse2 (
5861             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5862             src_x, src_y, dst_x, dst_y, width, height))
5864     {
5865         return _pixman_implementation_blt (
5866             imp->delegate,
5867             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5868             src_x, src_y, dst_x, dst_y, width, height);
5869     }
5870
5871     return TRUE;
5872 }
5873
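/*
 * On 32-bit GCC the incoming stack is only guaranteed 4-byte alignment,
 * so the argument pointer is realigned before any SSE2 register spills.
 * Like sse2_blt, sse2_fill falls back to the delegate when
 * pixman_fill_sse2 () cannot handle the request.
 */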
5874 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5875 __attribute__((__force_align_arg_pointer__))
5876 #endif
5877 static pixman_bool_t
5878 sse2_fill (pixman_implementation_t *imp,
5879            uint32_t *               bits,
5880            int                      stride,
5881            int                      bpp,
5882            int                      x,
5883            int                      y,
5884            int                      width,
5885            int                      height,
5886            uint32_t xor)
5887 {
5888     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5889     {
5890         return _pixman_implementation_fill (
5891             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5892     }
5893
5894     return TRUE;
5895 }
5896
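/*
 * Create the SSE2 implementation.  It delegates to the MMX implementation
 * when that is available (otherwise to the generic fast-path code), sets
 * up the SSE2 and MMX constants used above, and installs the combiner,
 * blt and fill entry points.
 */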
5897 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5898 __attribute__((__force_align_arg_pointer__))
5899 #endif
5900 pixman_implementation_t *
5901 _pixman_implementation_create_sse2 (void)
5902 {
5903 #ifdef USE_MMX
5904     pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
5905 #else
5906     pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
5907 #endif
5908     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
5909
5910     /* SSE2 constants */
5911     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5912     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5913     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5914     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5915     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5916     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5917     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5918     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5919     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
5920     mask_0080 = create_mask_16_128 (0x0080);
5921     mask_00ff = create_mask_16_128 (0x00ff);
5922     mask_0101 = create_mask_16_128 (0x0101);
5923     mask_ffff = create_mask_16_128 (0xffff);
5924     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5925     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5926
5927     /* MMX constants */
5928     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5929     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5930
5931     mask_x0080 = create_mask_16_64 (0x0080);
5932     mask_x00ff = create_mask_16_64 (0x00ff);
5933     mask_x0101 = create_mask_16_64 (0x0101);
5934     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5935
5936     _mm_empty ();
5937
5938     /* Set up function pointers */
5939
5940     /* SSE code patch for fbcompose.c */
5941     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5942     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5943     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5944     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5945     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5946     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5947     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5948     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5949     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5950     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5951
5952     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5953
5954     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5955     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5956     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5957     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5958     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5959     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5960     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5961     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5962     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5963     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5964     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5965
5966     imp->blt = sse2_blt;
5967     imp->fill = sse2_fill;
5968
5969     return imp;
5970 }
5971
5972 #endif /* USE_SSE2 */