1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <mmintrin.h>
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38 #include "pixman-fast-path.h"
39
40 #if defined(_MSC_VER) && defined(_M_AMD64)
41 /* The 64-bit Windows compiler does not support MMX intrinsics, so
42  * the pixman-x64-mmx-emulation.h file contains SSE2-based
43  * implementations of those MMX intrinsics that
44  * are used in the SSE2 implementation.
45  */
46 #   include "pixman-x64-mmx-emulation.h"
47 #endif
48
49 #ifdef USE_SSE2
50
51 /* --------------------------------------------------------------------
52  * Locals
53  */
54
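/* These masks are declared without initializers; they are expected to
 * be filled in once by the SSE2 implementation's setup code before any
 * of the routines below run.
 */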
55 static __m64 mask_x0080;
56 static __m64 mask_x00ff;
57 static __m64 mask_x0101;
58 static __m64 mask_x_alpha;
59
60 static __m64 mask_x565_rgb;
61 static __m64 mask_x565_unpack;
62
63 static __m128i mask_0080;
64 static __m128i mask_00ff;
65 static __m128i mask_0101;
66 static __m128i mask_ffff;
67 static __m128i mask_ff000000;
68 static __m128i mask_alpha;
69
70 static __m128i mask_565_r;
71 static __m128i mask_565_g1, mask_565_g2;
72 static __m128i mask_565_b;
73 static __m128i mask_red;
74 static __m128i mask_green;
75 static __m128i mask_blue;
76
77 static __m128i mask_565_fix_rb;
78 static __m128i mask_565_fix_g;
79
80 /* ----------------------------------------------------------------------
81  * SSE2 Inlines
82  */
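/* Unpack one a8r8g8b8 pixel into four 16-bit channels in the low half
 * of an XMM register: 0xAARRGGBB becomes 0x00AA00RR00GG00BB, so the
 * alpha ends up in word 3 (which is what expand_alpha_1x128 relies on).
 */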
83 static force_inline __m128i
84 unpack_32_1x128 (uint32_t data)
85 {
86     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
87 }
88
89 static force_inline void
90 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
91 {
92     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
93     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
94 }
95
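/* Convert four 565 pixels (each zero-extended into a 32-bit lane) to
 * 8888: shift each field to the top of its byte, then replicate the
 * field's high bits into the low bits so that e.g. 0x1f expands to
 * 0xff rather than 0xf8.
 */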
96 static force_inline __m128i
97 unpack_565_to_8888 (__m128i lo)
98 {
99     __m128i r, g, b, rb, t;
100
101     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
102     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
103     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
104
105     rb = _mm_or_si128 (r, b);
106     t  = _mm_and_si128 (rb, mask_565_fix_rb);
107     t  = _mm_srli_epi32 (t, 5);
108     rb = _mm_or_si128 (rb, t);
109
110     t  = _mm_and_si128 (g, mask_565_fix_g);
111     t  = _mm_srli_epi32 (t, 6);
112     g  = _mm_or_si128 (g, t);
113
114     return _mm_or_si128 (rb, g);
115 }
116
117 static force_inline void
118 unpack_565_128_4x128 (__m128i  data,
119                       __m128i* data0,
120                       __m128i* data1,
121                       __m128i* data2,
122                       __m128i* data3)
123 {
124     __m128i lo, hi;
125
126     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
127     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
128
129     lo = unpack_565_to_8888 (lo);
130     hi = unpack_565_to_8888 (hi);
131
132     unpack_128_2x128 (lo, data0, data1);
133     unpack_128_2x128 (hi, data2, data3);
134 }
135
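/* Truncate one x8r8g8b8 pixel to r5g6b5 by keeping the high bits of
 * each channel; for example, 0x00ff8040 packs to 0xfc08.
 */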
136 static force_inline uint16_t
137 pack_565_32_16 (uint32_t pixel)
138 {
139     return (uint16_t) (((pixel >> 8) & 0xf800) |
140                        ((pixel >> 5) & 0x07e0) |
141                        ((pixel >> 3) & 0x001f));
142 }
143
144 static force_inline __m128i
145 pack_2x128_128 (__m128i lo, __m128i hi)
146 {
147     return _mm_packus_epi16 (lo, hi);
148 }
149
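/* Pack four unpacked pixels back to 8888 and then split each pixel's
 * 565 value across the two 16-bit halves of its 32-bit lane (high and
 * low byte kept separately), so that the final _mm_packus_epi16 in
 * pack_565_4x128_128 assembles eight r5g6b5 pixels in one register.
 */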
150 static force_inline __m128i
151 pack_565_2x128_128 (__m128i lo, __m128i hi)
152 {
153     __m128i data;
154     __m128i r, g1, g2, b;
155
156     data = pack_2x128_128 (lo, hi);
157
158     r  = _mm_and_si128 (data, mask_565_r);
159     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
160     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
161     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
162
163     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
164 }
165
166 static force_inline __m128i
167 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
168 {
169     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
170                              pack_565_2x128_128 (*xmm2, *xmm3));
171 }
172
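/* In a8r8g8b8 the alpha bytes sit at offsets 3, 7, 11 and 15, so the
 * 0x8888 movemask pattern below tests only the alpha byte of each of
 * the four pixels: is_opaque is true when all four alphas are 0xff,
 * is_transparent when all four are 0x00.
 */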
173 static force_inline int
174 is_opaque (__m128i x)
175 {
176     __m128i ffs = _mm_cmpeq_epi8 (x, x);
177
178     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
179 }
180
181 static force_inline int
182 is_zero (__m128i x)
183 {
184     return _mm_movemask_epi8 (
185         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
186 }
187
188 static force_inline int
189 is_transparent (__m128i x)
190 {
191     return (_mm_movemask_epi8 (
192                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
193 }
194
195 static force_inline __m128i
196 expand_pixel_32_1x128 (uint32_t data)
197 {
198     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
199 }
200
201 static force_inline __m128i
202 expand_alpha_1x128 (__m128i data)
203 {
204     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
205                                                      _MM_SHUFFLE (3, 3, 3, 3)),
206                                 _MM_SHUFFLE (3, 3, 3, 3));
207 }
208
209 static force_inline void
210 expand_alpha_2x128 (__m128i  data_lo,
211                     __m128i  data_hi,
212                     __m128i* alpha_lo,
213                     __m128i* alpha_hi)
214 {
215     __m128i lo, hi;
216
217     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
218     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
219
220     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
221     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
222 }
223
224 static force_inline void
225 expand_alpha_rev_2x128 (__m128i  data_lo,
226                         __m128i  data_hi,
227                         __m128i* alpha_lo,
228                         __m128i* alpha_hi)
229 {
230     __m128i lo, hi;
231
232     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
233     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
234     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
235     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
236 }
237
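/* Per-channel multiply of eight 16-bit channels with rounded division
 * by 255, using the identity (a * b + 0x80) * 0x0101 >> 16, which is
 * exact for a, b in [0, 255].  The adds/mulhi pair below computes this,
 * with mask_0080 and mask_0101 holding (as their names suggest) 0x0080
 * and 0x0101 in every 16-bit lane.
 */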
238 static force_inline void
239 pix_multiply_2x128 (__m128i* data_lo,
240                     __m128i* data_hi,
241                     __m128i* alpha_lo,
242                     __m128i* alpha_hi,
243                     __m128i* ret_lo,
244                     __m128i* ret_hi)
245 {
246     __m128i lo, hi;
247
248     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
249     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
250     lo = _mm_adds_epu16 (lo, mask_0080);
251     hi = _mm_adds_epu16 (hi, mask_0080);
252     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
253     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
254 }
255
256 static force_inline void
257 pix_add_multiply_2x128 (__m128i* src_lo,
258                         __m128i* src_hi,
259                         __m128i* alpha_dst_lo,
260                         __m128i* alpha_dst_hi,
261                         __m128i* dst_lo,
262                         __m128i* dst_hi,
263                         __m128i* alpha_src_lo,
264                         __m128i* alpha_src_hi,
265                         __m128i* ret_lo,
266                         __m128i* ret_hi)
267 {
268     __m128i t1_lo, t1_hi;
269     __m128i t2_lo, t2_hi;
270
271     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
272     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
273
274     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
275     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
276 }
277
278 static force_inline void
279 negate_2x128 (__m128i  data_lo,
280               __m128i  data_hi,
281               __m128i* neg_lo,
282               __m128i* neg_hi)
283 {
284     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
285     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
286 }
287
288 static force_inline void
289 invert_colors_2x128 (__m128i  data_lo,
290                      __m128i  data_hi,
291                      __m128i* inv_lo,
292                      __m128i* inv_hi)
293 {
294     __m128i lo, hi;
295
296     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
297     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
298     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
299     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
300 }
301
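/* Porter-Duff OVER on two registers of unpacked pixels:
 * dst = src + (1 - alpha) * dst, computed per channel with a
 * saturating add.
 */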
302 static force_inline void
303 over_2x128 (__m128i* src_lo,
304             __m128i* src_hi,
305             __m128i* alpha_lo,
306             __m128i* alpha_hi,
307             __m128i* dst_lo,
308             __m128i* dst_hi)
309 {
310     __m128i t1, t2;
311
312     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
313
314     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
315
316     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
317     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
318 }
319
320 static force_inline void
321 over_rev_non_pre_2x128 (__m128i  src_lo,
322                         __m128i  src_hi,
323                         __m128i* dst_lo,
324                         __m128i* dst_hi)
325 {
326     __m128i lo, hi;
327     __m128i alpha_lo, alpha_hi;
328
329     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
330
331     lo = _mm_or_si128 (alpha_lo, mask_alpha);
332     hi = _mm_or_si128 (alpha_hi, mask_alpha);
333
334     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
335
336     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
337
338     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
339 }
340
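/* (src IN mask) OVER dst: both the source and its alpha are first
 * multiplied by the (per-channel) mask, then composited over dst.
 */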
341 static force_inline void
342 in_over_2x128 (__m128i* src_lo,
343                __m128i* src_hi,
344                __m128i* alpha_lo,
345                __m128i* alpha_hi,
346                __m128i* mask_lo,
347                __m128i* mask_hi,
348                __m128i* dst_lo,
349                __m128i* dst_hi)
350 {
351     __m128i s_lo, s_hi;
352     __m128i a_lo, a_hi;
353
354     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
355     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
356
357     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
358 }
359
360 /* load 4 pixels from a 16-byte aligned address */
361 static force_inline __m128i
362 load_128_aligned (__m128i* src)
363 {
364     return _mm_load_si128 (src);
365 }
366
367 /* load 4 pixels from an unaligned address */
368 static force_inline __m128i
369 load_128_unaligned (const __m128i* src)
370 {
371     return _mm_loadu_si128 (src);
372 }
373
374 /* save 4 pixels to a 16-byte aligned address using a non-temporal
375  * (write-combining) store
376  */
377 static force_inline void
378 save_128_write_combining (__m128i* dst,
379                           __m128i  data)
380 {
381     _mm_stream_si128 (dst, data);
382 }
383
384 /* save 4 pixels to a 16-byte aligned address */
385 static force_inline void
386 save_128_aligned (__m128i* dst,
387                   __m128i  data)
388 {
389     _mm_store_si128 (dst, data);
390 }
391
392 /* save 4 pixels to an unaligned address */
393 static force_inline void
394 save_128_unaligned (__m128i* dst,
395                     __m128i  data)
396 {
397     _mm_storeu_si128 (dst, data);
398 }
399
400 /* ------------------------------------------------------------------
401  * MMX inlines
402  */
403
404 static force_inline __m64
405 load_32_1x64 (uint32_t data)
406 {
407     return _mm_cvtsi32_si64 (data);
408 }
409
410 static force_inline __m64
411 unpack_32_1x64 (uint32_t data)
412 {
413     return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
414 }
415
416 static force_inline __m64
417 expand_alpha_1x64 (__m64 data)
418 {
419     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
420 }
421
422 static force_inline __m64
423 expand_alpha_rev_1x64 (__m64 data)
424 {
425     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
426 }
427
428 static force_inline __m64
429 expand_pixel_8_1x64 (uint8_t data)
430 {
431     return _mm_shuffle_pi16 (
432         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
433 }
434
435 static force_inline __m64
436 pix_multiply_1x64 (__m64 data,
437                    __m64 alpha)
438 {
439     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
440                                           mask_x0080),
441                            mask_x0101);
442 }
443
444 static force_inline __m64
445 pix_add_multiply_1x64 (__m64* src,
446                        __m64* alpha_dst,
447                        __m64* dst,
448                        __m64* alpha_src)
449 {
450     __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
451     __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
452
453     return _mm_adds_pu8 (t1, t2);
454 }
455
456 static force_inline __m64
457 negate_1x64 (__m64 data)
458 {
459     return _mm_xor_si64 (data, mask_x00ff);
460 }
461
462 static force_inline __m64
463 invert_colors_1x64 (__m64 data)
464 {
465     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
466 }
467
468 static force_inline __m64
469 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
470 {
471     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
472 }
473
474 static force_inline __m64
475 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
476 {
477     return over_1x64 (pix_multiply_1x64 (*src, *mask),
478                       pix_multiply_1x64 (*alpha, *mask),
479                       *dst);
480 }
481
482 static force_inline __m64
483 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
484 {
485     __m64 alpha = expand_alpha_1x64 (src);
486
487     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
488                                          _mm_or_si64 (alpha, mask_x_alpha)),
489                       alpha,
490                       dst);
491 }
492
493 static force_inline uint32_t
494 pack_1x64_32 (__m64 data)
495 {
496     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
497 }
498
499 /* Expand a 565 pixel held in the low 16 bits of an MMX register into
500  *
501  *    00RR00GG00BB
502  *
503  * --- Expanding 565 in the low word ---
504  *
505  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
506  * m = m & (01f0003f001f);
507  * m = m * (008404100840);
508  * m = m >> 8;
509  *
510  * Note the trick here - the top word is shifted by another nibble to
511  * avoid it bumping into the middle word
512  */
513 static force_inline __m64
514 expand565_16_1x64 (uint16_t pixel)
515 {
516     __m64 p;
517     __m64 t1, t2;
518
519     p = _mm_cvtsi32_si64 ((uint32_t) pixel);
520
521     t1 = _mm_slli_si64 (p, 36 - 11);
522     t2 = _mm_slli_si64 (p, 16 - 5);
523
524     p = _mm_or_si64 (t1, p);
525     p = _mm_or_si64 (t2, p);
526     p = _mm_and_si64 (p, mask_x565_rgb);
527     p = _mm_mullo_pi16 (p, mask_x565_unpack);
528
529     return _mm_srli_pi16 (p, 8);
530 }
531
532 /* ----------------------------------------------------------------------------
533  * Compose Core transformations
534  */
535 static force_inline uint32_t
536 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
537 {
538     uint8_t a;
539     __m64 ms;
540
541     a = src >> 24;
542
543     if (a == 0xff)
544     {
545         return src;
546     }
547     else if (src)
548     {
549         ms = unpack_32_1x64 (src);
550         return pack_1x64_32 (
551             over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
552     }
553
554     return dst;
555 }
556
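/* Fetch one source pixel, multiplying it by the mask's alpha when a
 * mask is present (the *_u combiners treat the mask as a single alpha
 * value rather than per channel).
 */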
557 static force_inline uint32_t
558 combine1 (const uint32_t *ps, const uint32_t *pm)
559 {
560     uint32_t s = *ps;
561
562     if (pm)
563     {
564         __m64 ms, mm;
565
566         mm = unpack_32_1x64 (*pm);
567         mm = expand_alpha_1x64 (mm);
568
569         ms = unpack_32_1x64 (s);
570         ms = pix_multiply_1x64 (ms, mm);
571
572         s = pack_1x64_32 (ms);
573     }
574
575     return s;
576 }
577
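/* Fetch four source pixels, applying the mask's alpha as in combine1.
 * A fully transparent mask short-circuits to zero.  Both loads are
 * unaligned because only the destination is aligned by the callers.
 */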
578 static force_inline __m128i
579 combine4 (const __m128i *ps, const __m128i *pm)
580 {
581     __m128i xmm_src_lo, xmm_src_hi;
582     __m128i xmm_msk_lo, xmm_msk_hi;
583     __m128i s;
584
585     if (pm)
586     {
587         xmm_msk_lo = load_128_unaligned (pm);
588
589         if (is_transparent (xmm_msk_lo))
590             return _mm_setzero_si128 ();
591     }
592
593     s = load_128_unaligned (ps);
594
595     if (pm)
596     {
597         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
598         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
599
600         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
601
602         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
603                             &xmm_msk_lo, &xmm_msk_hi,
604                             &xmm_src_lo, &xmm_src_hi);
605
606         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
607     }
608
609     return s;
610 }
611
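/* The unified combiners below all follow the same pattern: a scalar
 * loop until the destination is 16-byte aligned, a 4-pixel SSE2 loop,
 * and a scalar tail.  OVER additionally short-circuits all-opaque
 * source blocks (plain store) and all-zero source blocks (destination
 * left untouched).
 */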
612 static force_inline void
613 core_combine_over_u_sse2 (uint32_t*       pd,
614                           const uint32_t* ps,
615                           const uint32_t* pm,
616                           int             w)
617 {
618     uint32_t s, d;
619
620     __m128i xmm_dst_lo, xmm_dst_hi;
621     __m128i xmm_src_lo, xmm_src_hi;
622     __m128i xmm_alpha_lo, xmm_alpha_hi;
623
624     /* Align dst on a 16-byte boundary */
625     while (w && ((unsigned long)pd & 15))
626     {
627         d = *pd;
628         s = combine1 (ps, pm);
629
630         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
631         ps++;
632         if (pm)
633             pm++;
634         w--;
635     }
636
637     while (w >= 4)
638     {
639         /* Load unaligned: the source and mask pointers are not
640          * guaranteed to be 16-byte aligned here.
641          */
642         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
643
644         if (is_opaque (xmm_src_hi))
645         {
646             save_128_aligned ((__m128i*)pd, xmm_src_hi);
647         }
648         else if (!is_zero (xmm_src_hi))
649         {
650             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
651
652             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
653             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
654
655             expand_alpha_2x128 (
656                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
657
658             over_2x128 (&xmm_src_lo, &xmm_src_hi,
659                         &xmm_alpha_lo, &xmm_alpha_hi,
660                         &xmm_dst_lo, &xmm_dst_hi);
661
662             /* rebuild the 4 pixel data and save */
663             save_128_aligned ((__m128i*)pd,
664                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
665         }
666
667         w -= 4;
668         ps += 4;
669         pd += 4;
670         if (pm)
671             pm += 4;
672     }
673
674     while (w)
675     {
676         d = *pd;
677         s = combine1 (ps, pm);
678
679         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
680         ps++;
681         if (pm)
682             pm++;
683
684         w--;
685     }
686 }
687
688 static force_inline void
689 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
690                                   const uint32_t* ps,
691                                   const uint32_t* pm,
692                                   int             w)
693 {
694     uint32_t s, d;
695
696     __m128i xmm_dst_lo, xmm_dst_hi;
697     __m128i xmm_src_lo, xmm_src_hi;
698     __m128i xmm_alpha_lo, xmm_alpha_hi;
699
700     /* Align dst on a 16-byte boundary */
701     while (w &&
702            ((unsigned long)pd & 15))
703     {
704         d = *pd;
705         s = combine1 (ps, pm);
706
707         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
708         w--;
709         ps++;
710         if (pm)
711             pm++;
712     }
713
714     while (w >= 4)
715     {
716         /* Load unaligned: the source and mask pointers are not
717          * guaranteed to be 16-byte aligned here.
718          */
719         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
720         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
721
722         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
723         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
724
725         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
726                             &xmm_alpha_lo, &xmm_alpha_hi);
727
728         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
729                     &xmm_alpha_lo, &xmm_alpha_hi,
730                     &xmm_src_lo, &xmm_src_hi);
731
732         /* rebuild the 4 pixel data and save */
733         save_128_aligned ((__m128i*)pd,
734                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
735
736         w -= 4;
737         ps += 4;
738         pd += 4;
739
740         if (pm)
741             pm += 4;
742     }
743
744     while (w)
745     {
746         d = *pd;
747         s = combine1 (ps, pm);
748
749         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
750         ps++;
751         w--;
752         if (pm)
753             pm++;
754     }
755 }
756
757 static force_inline uint32_t
758 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
759 {
760     uint32_t maska = src >> 24;
761
762     if (maska == 0)
763     {
764         return 0;
765     }
766     else if (maska != 0xff)
767     {
768         return pack_1x64_32 (
769             pix_multiply_1x64 (unpack_32_1x64 (dst),
770                                expand_alpha_1x64 (unpack_32_1x64 (src))));
771     }
772
773     return dst;
774 }
775
776 static force_inline void
777 core_combine_in_u_sse2 (uint32_t*       pd,
778                         const uint32_t* ps,
779                         const uint32_t* pm,
780                         int             w)
781 {
782     uint32_t s, d;
783
784     __m128i xmm_src_lo, xmm_src_hi;
785     __m128i xmm_dst_lo, xmm_dst_hi;
786
787     while (w && ((unsigned long) pd & 15))
788     {
789         s = combine1 (ps, pm);
790         d = *pd;
791
792         *pd++ = core_combine_in_u_pixelsse2 (d, s);
793         w--;
794         ps++;
795         if (pm)
796             pm++;
797     }
798
799     while (w >= 4)
800     {
801         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
802         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
803
804         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
805         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
806
807         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
808         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
809                             &xmm_dst_lo, &xmm_dst_hi,
810                             &xmm_dst_lo, &xmm_dst_hi);
811
812         save_128_aligned ((__m128i*)pd,
813                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
814
815         ps += 4;
816         pd += 4;
817         w -= 4;
818         if (pm)
819             pm += 4;
820     }
821
822     while (w)
823     {
824         s = combine1 (ps, pm);
825         d = *pd;
826
827         *pd++ = core_combine_in_u_pixelsse2 (d, s);
828         w--;
829         ps++;
830         if (pm)
831             pm++;
832     }
833 }
834
835 static force_inline void
836 core_combine_reverse_in_u_sse2 (uint32_t*       pd,
837                                 const uint32_t* ps,
838                                 const uint32_t *pm,
839                                 int             w)
840 {
841     uint32_t s, d;
842
843     __m128i xmm_src_lo, xmm_src_hi;
844     __m128i xmm_dst_lo, xmm_dst_hi;
845
846     while (w && ((unsigned long) pd & 15))
847     {
848         s = combine1 (ps, pm);
849         d = *pd;
850
851         *pd++ = core_combine_in_u_pixelsse2 (s, d);
852         ps++;
853         w--;
854         if (pm)
855             pm++;
856     }
857
858     while (w >= 4)
859     {
860         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
861         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
862
863         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
864         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
865
866         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
867         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
868                             &xmm_src_lo, &xmm_src_hi,
869                             &xmm_dst_lo, &xmm_dst_hi);
870
871         save_128_aligned (
872             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
873
874         ps += 4;
875         pd += 4;
876         w -= 4;
877         if (pm)
878             pm += 4;
879     }
880
881     while (w)
882     {
883         s = combine1 (ps, pm);
884         d = *pd;
885
886         *pd++ = core_combine_in_u_pixelsse2 (s, d);
887         w--;
888         ps++;
889         if (pm)
890             pm++;
891     }
892 }
893
894 static force_inline void
895 core_combine_reverse_out_u_sse2 (uint32_t*       pd,
896                                  const uint32_t* ps,
897                                  const uint32_t* pm,
898                                  int             w)
899 {
900     while (w && ((unsigned long) pd & 15))
901     {
902         uint32_t s = combine1 (ps, pm);
903         uint32_t d = *pd;
904
905         *pd++ = pack_1x64_32 (
906             pix_multiply_1x64 (
907                 unpack_32_1x64 (d), negate_1x64 (
908                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
909
910         if (pm)
911             pm++;
912         ps++;
913         w--;
914     }
915
916     while (w >= 4)
917     {
918         __m128i xmm_src_lo, xmm_src_hi;
919         __m128i xmm_dst_lo, xmm_dst_hi;
920
921         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
922         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
923
924         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
925         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
926
927         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
928         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
929
930         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
931                             &xmm_src_lo, &xmm_src_hi,
932                             &xmm_dst_lo, &xmm_dst_hi);
933
934         save_128_aligned (
935             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
936
937         ps += 4;
938         pd += 4;
939         if (pm)
940             pm += 4;
941
942         w -= 4;
943     }
944
945     while (w)
946     {
947         uint32_t s = combine1 (ps, pm);
948         uint32_t d = *pd;
949
950         *pd++ = pack_1x64_32 (
951             pix_multiply_1x64 (
952                 unpack_32_1x64 (d), negate_1x64 (
953                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
954         ps++;
955         if (pm)
956             pm++;
957         w--;
958     }
959 }
960
961 static force_inline void
962 core_combine_out_u_sse2 (uint32_t*       pd,
963                          const uint32_t* ps,
964                          const uint32_t* pm,
965                          int             w)
966 {
967     while (w && ((unsigned long) pd & 15))
968     {
969         uint32_t s = combine1 (ps, pm);
970         uint32_t d = *pd;
971
972         *pd++ = pack_1x64_32 (
973             pix_multiply_1x64 (
974                 unpack_32_1x64 (s), negate_1x64 (
975                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
976         w--;
977         ps++;
978         if (pm)
979             pm++;
980     }
981
982     while (w >= 4)
983     {
984         __m128i xmm_src_lo, xmm_src_hi;
985         __m128i xmm_dst_lo, xmm_dst_hi;
986
987         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
988         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
989
990         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
991         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
992
993         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
994         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
995
996         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
997                             &xmm_dst_lo, &xmm_dst_hi,
998                             &xmm_dst_lo, &xmm_dst_hi);
999
1000         save_128_aligned (
1001             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1002
1003         ps += 4;
1004         pd += 4;
1005         w -= 4;
1006         if (pm)
1007             pm += 4;
1008     }
1009
1010     while (w)
1011     {
1012         uint32_t s = combine1 (ps, pm);
1013         uint32_t d = *pd;
1014
1015         *pd++ = pack_1x64_32 (
1016             pix_multiply_1x64 (
1017                 unpack_32_1x64 (s), negate_1x64 (
1018                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1019         w--;
1020         ps++;
1021         if (pm)
1022             pm++;
1023     }
1024 }
1025
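/* ATOP: result = src * dst.alpha + dst * (1 - src.alpha). */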
1026 static force_inline uint32_t
1027 core_combine_atop_u_pixel_sse2 (uint32_t src,
1028                                 uint32_t dst)
1029 {
1030     __m64 s = unpack_32_1x64 (src);
1031     __m64 d = unpack_32_1x64 (dst);
1032
1033     __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1034     __m64 da = expand_alpha_1x64 (d);
1035
1036     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1037 }
1038
1039 static force_inline void
1040 core_combine_atop_u_sse2 (uint32_t*       pd,
1041                           const uint32_t* ps,
1042                           const uint32_t* pm,
1043                           int             w)
1044 {
1045     uint32_t s, d;
1046
1047     __m128i xmm_src_lo, xmm_src_hi;
1048     __m128i xmm_dst_lo, xmm_dst_hi;
1049     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1050     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1051
1052     while (w && ((unsigned long) pd & 15))
1053     {
1054         s = combine1 (ps, pm);
1055         d = *pd;
1056
1057         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1058         w--;
1059         ps++;
1060         if (pm)
1061             pm++;
1062     }
1063
1064     while (w >= 4)
1065     {
1066         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1067         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1068
1069         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1070         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1071
1072         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1073                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1074         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1075                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1076
1077         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1078                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1079
1080         pix_add_multiply_2x128 (
1081             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1082             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1083             &xmm_dst_lo, &xmm_dst_hi);
1084
1085         save_128_aligned (
1086             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1087
1088         ps += 4;
1089         pd += 4;
1090         w -= 4;
1091         if (pm)
1092             pm += 4;
1093     }
1094
1095     while (w)
1096     {
1097         s = combine1 (ps, pm);
1098         d = *pd;
1099
1100         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1101         w--;
1102         ps++;
1103         if (pm)
1104             pm++;
1105     }
1106 }
1107
1108 static force_inline uint32_t
1109 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1110                                         uint32_t dst)
1111 {
1112     __m64 s = unpack_32_1x64 (src);
1113     __m64 d = unpack_32_1x64 (dst);
1114
1115     __m64 sa = expand_alpha_1x64 (s);
1116     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1117
1118     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1119 }
1120
1121 static force_inline void
1122 core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
1123                                   const uint32_t* ps,
1124                                   const uint32_t* pm,
1125                                   int             w)
1126 {
1127     uint32_t s, d;
1128
1129     __m128i xmm_src_lo, xmm_src_hi;
1130     __m128i xmm_dst_lo, xmm_dst_hi;
1131     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1132     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1133
1134     while (w && ((unsigned long) pd & 15))
1135     {
1136         s = combine1 (ps, pm);
1137         d = *pd;
1138
1139         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1140         ps++;
1141         w--;
1142         if (pm)
1143             pm++;
1144     }
1145
1146     while (w >= 4)
1147     {
1148         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1149         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1150
1151         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1152         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1153
1154         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1155                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1156         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1157                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1158
1159         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1160                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1161
1162         pix_add_multiply_2x128 (
1163             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1164             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1165             &xmm_dst_lo, &xmm_dst_hi);
1166
1167         save_128_aligned (
1168             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1169
1170         ps += 4;
1171         pd += 4;
1172         w -= 4;
1173         if (pm)
1174             pm += 4;
1175     }
1176
1177     while (w)
1178     {
1179         s = combine1 (ps, pm);
1180         d = *pd;
1181
1182         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1183         ps++;
1184         w--;
1185         if (pm)
1186             pm++;
1187     }
1188 }
1189
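/* XOR: result = src * (1 - dst.alpha) + dst * (1 - src.alpha). */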
1190 static force_inline uint32_t
1191 core_combine_xor_u_pixel_sse2 (uint32_t src,
1192                                uint32_t dst)
1193 {
1194     __m64 s = unpack_32_1x64 (src);
1195     __m64 d = unpack_32_1x64 (dst);
1196
1197     __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1198     __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1199
1200     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1201 }
1202
1203 static force_inline void
1204 core_combine_xor_u_sse2 (uint32_t*       dst,
1205                          const uint32_t* src,
1206                          const uint32_t *mask,
1207                          int             width)
1208 {
1209     int w = width;
1210     uint32_t s, d;
1211     uint32_t* pd = dst;
1212     const uint32_t* ps = src;
1213     const uint32_t* pm = mask;
1214
1215     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1216     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1217     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1218     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1219
1220     while (w && ((unsigned long) pd & 15))
1221     {
1222         s = combine1 (ps, pm);
1223         d = *pd;
1224
1225         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1226         w--;
1227         ps++;
1228         if (pm)
1229             pm++;
1230     }
1231
1232     while (w >= 4)
1233     {
1234         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1235         xmm_dst = load_128_aligned ((__m128i*) pd);
1236
1237         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1238         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1239
1240         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1241                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1242         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1243                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1244
1245         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1246                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1247         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1248                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1249
1250         pix_add_multiply_2x128 (
1251             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1252             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1253             &xmm_dst_lo, &xmm_dst_hi);
1254
1255         save_128_aligned (
1256             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1257
1258         ps += 4;
1259         pd += 4;
1260         w -= 4;
1261         if (pm)
1262             pm += 4;
1263     }
1264
1265     while (w)
1266     {
1267         s = combine1 (ps, pm);
1268         d = *pd;
1269
1270         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1271         w--;
1272         ps++;
1273         if (pm)
1274             pm++;
1275     }
1276 }
1277
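/* ADD is a plain saturating per-channel addition, so the pixels never
 * need to be unpacked to 16 bits; _mm_adds_epu8 works directly on the
 * packed bytes.
 */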
1278 static force_inline void
1279 core_combine_add_u_sse2 (uint32_t*       dst,
1280                          const uint32_t* src,
1281                          const uint32_t* mask,
1282                          int             width)
1283 {
1284     int w = width;
1285     uint32_t s, d;
1286     uint32_t* pd = dst;
1287     const uint32_t* ps = src;
1288     const uint32_t* pm = mask;
1289
1290     while (w && (unsigned long)pd & 15)
1291     {
1292         s = combine1 (ps, pm);
1293         d = *pd;
1294
1295         ps++;
1296         if (pm)
1297             pm++;
1298         *pd++ = _mm_cvtsi64_si32 (
1299             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1300         w--;
1301     }
1302
1303     while (w >= 4)
1304     {
1305         __m128i s;
1306
1307         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1308
1309         save_128_aligned (
1310             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1311
1312         pd += 4;
1313         ps += 4;
1314         if (pm)
1315             pm += 4;
1316         w -= 4;
1317     }
1318
1319     while (w--)
1320     {
1321         s = combine1 (ps, pm);
1322         d = *pd;
1323
1324         ps++;
1325         *pd++ = _mm_cvtsi64_si32 (
1326             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1327         if (pm)
1328             pm++;
1329     }
1330 }
1331
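/* SATURATE: when the source alpha exceeds the destination's remaining
 * headroom (~dst.alpha), scale the source by the ratio da / sa before
 * adding; otherwise this degenerates to ADD.
 */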
1332 static force_inline uint32_t
1333 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1334                                     uint32_t dst)
1335 {
1336     __m64 ms = unpack_32_1x64 (src);
1337     __m64 md = unpack_32_1x64 (dst);
1338     uint32_t sa = src >> 24;
1339     uint32_t da = ~dst >> 24;
1340
1341     if (sa > da)
1342     {
1343         ms = pix_multiply_1x64 (
1344             ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1345     }
1346
1347     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1348 }
1349
1350 static force_inline void
1351 core_combine_saturate_u_sse2 (uint32_t *      pd,
1352                               const uint32_t *ps,
1353                               const uint32_t *pm,
1354                               int             w)
1355 {
1356     uint32_t s, d;
1357
1358     uint32_t pack_cmp;
1359     __m128i xmm_src, xmm_dst;
1360
1361     while (w && (unsigned long)pd & 15)
1362     {
1363         s = combine1 (ps, pm);
1364         d = *pd;
1365
1366         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1367         w--;
1368         ps++;
1369         if (pm)
1370             pm++;
1371     }
1372
1373     while (w >= 4)
1374     {
1375         xmm_dst = load_128_aligned  ((__m128i*)pd);
1376         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1377
1378         pack_cmp = _mm_movemask_epi8 (
1379             _mm_cmpgt_epi32 (
1380                 _mm_srli_epi32 (xmm_src, 24),
1381                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1382
1383         /* if any source alpha is greater than the corresponding ~dst alpha */
1384         if (pack_cmp)
1385         {
1386             s = combine1 (ps++, pm);
1387             d = *pd;
1388             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1389             if (pm)
1390                 pm++;
1391
1392             s = combine1 (ps++, pm);
1393             d = *pd;
1394             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1395             if (pm)
1396                 pm++;
1397
1398             s = combine1 (ps++, pm);
1399             d = *pd;
1400             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1401             if (pm)
1402                 pm++;
1403
1404             s = combine1 (ps++, pm);
1405             d = *pd;
1406             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1407             if (pm)
1408                 pm++;
1409         }
1410         else
1411         {
1412             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1413
1414             pd += 4;
1415             ps += 4;
1416             if (pm)
1417                 pm += 4;
1418         }
1419
1420         w -= 4;
1421     }
1422
1423     while (w--)
1424     {
1425         s = combine1 (ps, pm);
1426         d = *pd;
1427
1428         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1429         ps++;
1430         if (pm)
1431             pm++;
1432     }
1433 }
1434
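/* The *_ca combiners below are the component-alpha variants: the mask
 * supplies a separate 8-bit factor for each color channel instead of a
 * single alpha value, so source and mask are always multiplied per
 * channel.
 */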
1435 static force_inline void
1436 core_combine_src_ca_sse2 (uint32_t*       pd,
1437                           const uint32_t* ps,
1438                           const uint32_t *pm,
1439                           int             w)
1440 {
1441     uint32_t s, m;
1442
1443     __m128i xmm_src_lo, xmm_src_hi;
1444     __m128i xmm_mask_lo, xmm_mask_hi;
1445     __m128i xmm_dst_lo, xmm_dst_hi;
1446
1447     while (w && (unsigned long)pd & 15)
1448     {
1449         s = *ps++;
1450         m = *pm++;
1451         *pd++ = pack_1x64_32 (
1452             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1453         w--;
1454     }
1455
1456     while (w >= 4)
1457     {
1458         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1459         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1460
1461         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1462         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1463
1464         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1465                             &xmm_mask_lo, &xmm_mask_hi,
1466                             &xmm_dst_lo, &xmm_dst_hi);
1467
1468         save_128_aligned (
1469             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1470
1471         ps += 4;
1472         pd += 4;
1473         pm += 4;
1474         w -= 4;
1475     }
1476
1477     while (w)
1478     {
1479         s = *ps++;
1480         m = *pm++;
1481         *pd++ = pack_1x64_32 (
1482             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1483         w--;
1484     }
1485 }
1486
1487 static force_inline uint32_t
1488 core_combine_over_ca_pixel_sse2 (uint32_t src,
1489                                  uint32_t mask,
1490                                  uint32_t dst)
1491 {
1492     __m64 s = unpack_32_1x64 (src);
1493     __m64 expAlpha = expand_alpha_1x64 (s);
1494     __m64 unpk_mask = unpack_32_1x64 (mask);
1495     __m64 unpk_dst  = unpack_32_1x64 (dst);
1496
1497     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1498 }
1499
1500 static force_inline void
1501 core_combine_over_ca_sse2 (uint32_t*       pd,
1502                            const uint32_t* ps,
1503                            const uint32_t *pm,
1504                            int             w)
1505 {
1506     uint32_t s, m, d;
1507
1508     __m128i xmm_alpha_lo, xmm_alpha_hi;
1509     __m128i xmm_src_lo, xmm_src_hi;
1510     __m128i xmm_dst_lo, xmm_dst_hi;
1511     __m128i xmm_mask_lo, xmm_mask_hi;
1512
1513     while (w && (unsigned long)pd & 15)
1514     {
1515         s = *ps++;
1516         m = *pm++;
1517         d = *pd;
1518
1519         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1520         w--;
1521     }
1522
1523     while (w >= 4)
1524     {
1525         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1526         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1527         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1528
1529         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1530         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1531         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1532
1533         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1534                             &xmm_alpha_lo, &xmm_alpha_hi);
1535
1536         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1537                        &xmm_alpha_lo, &xmm_alpha_hi,
1538                        &xmm_mask_lo, &xmm_mask_hi,
1539                        &xmm_dst_lo, &xmm_dst_hi);
1540
1541         save_128_aligned (
1542             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1543
1544         ps += 4;
1545         pd += 4;
1546         pm += 4;
1547         w -= 4;
1548     }
1549
1550     while (w)
1551     {
1552         s = *ps++;
1553         m = *pm++;
1554         d = *pd;
1555
1556         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1557         w--;
1558     }
1559 }
1560
1561 static force_inline uint32_t
1562 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1563                                          uint32_t mask,
1564                                          uint32_t dst)
1565 {
1566     __m64 d = unpack_32_1x64 (dst);
1567
1568     return pack_1x64_32 (
1569         over_1x64 (d, expand_alpha_1x64 (d),
1570                    pix_multiply_1x64 (unpack_32_1x64 (src),
1571                                       unpack_32_1x64 (mask))));
1572 }
1573
1574 static force_inline void
1575 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1576                                    const uint32_t* ps,
1577                                    const uint32_t *pm,
1578                                    int             w)
1579 {
1580     uint32_t s, m, d;
1581
1582     __m128i xmm_alpha_lo, xmm_alpha_hi;
1583     __m128i xmm_src_lo, xmm_src_hi;
1584     __m128i xmm_dst_lo, xmm_dst_hi;
1585     __m128i xmm_mask_lo, xmm_mask_hi;
1586
1587     while (w && (unsigned long)pd & 15)
1588     {
1589         s = *ps++;
1590         m = *pm++;
1591         d = *pd;
1592
1593         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1594         w--;
1595     }
1596
1597     while (w >= 4)
1598     {
1599         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1600         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1601         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1602
1603         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1604         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1605         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1606
1607         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1608                             &xmm_alpha_lo, &xmm_alpha_hi);
1609         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1610                             &xmm_mask_lo, &xmm_mask_hi,
1611                             &xmm_mask_lo, &xmm_mask_hi);
1612
1613         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1614                     &xmm_alpha_lo, &xmm_alpha_hi,
1615                     &xmm_mask_lo, &xmm_mask_hi);
1616
1617         save_128_aligned (
1618             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1619
1620         ps += 4;
1621         pd += 4;
1622         pm += 4;
1623         w -= 4;
1624     }
1625
1626     while (w)
1627     {
1628         s = *ps++;
1629         m = *pm++;
1630         d = *pd;
1631
1632         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1633         w--;
1634     }
1635 }
1636
1637 static force_inline void
1638 core_combine_in_ca_sse2 (uint32_t *      pd,
1639                          const uint32_t *ps,
1640                          const uint32_t *pm,
1641                          int             w)
1642 {
1643     uint32_t s, m, d;
1644
1645     __m128i xmm_alpha_lo, xmm_alpha_hi;
1646     __m128i xmm_src_lo, xmm_src_hi;
1647     __m128i xmm_dst_lo, xmm_dst_hi;
1648     __m128i xmm_mask_lo, xmm_mask_hi;
1649
1650     while (w && (unsigned long)pd & 15)
1651     {
1652         s = *ps++;
1653         m = *pm++;
1654         d = *pd;
1655
1656         *pd++ = pack_1x64_32 (
1657             pix_multiply_1x64 (
1658                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1659                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1660
1661         w--;
1662     }
1663
1664     while (w >= 4)
1665     {
1666         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1667         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1668         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1669
1670         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1671         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1672         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1673
1674         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1675                             &xmm_alpha_lo, &xmm_alpha_hi);
1676
1677         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1678                             &xmm_mask_lo, &xmm_mask_hi,
1679                             &xmm_dst_lo, &xmm_dst_hi);
1680
1681         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1682                             &xmm_alpha_lo, &xmm_alpha_hi,
1683                             &xmm_dst_lo, &xmm_dst_hi);
1684
1685         save_128_aligned (
1686             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1687
1688         ps += 4;
1689         pd += 4;
1690         pm += 4;
1691         w -= 4;
1692     }
1693
1694     while (w)
1695     {
1696         s = *ps++;
1697         m = *pm++;
1698         d = *pd;
1699
1700         *pd++ = pack_1x64_32 (
1701             pix_multiply_1x64 (
1702                 pix_multiply_1x64 (
1703                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1704                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1705
1706         w--;
1707     }
1708 }
1709
1710 static force_inline void
1711 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1712                                  const uint32_t *ps,
1713                                  const uint32_t *pm,
1714                                  int             w)
1715 {
1716     uint32_t s, m, d;
1717
1718     __m128i xmm_alpha_lo, xmm_alpha_hi;
1719     __m128i xmm_src_lo, xmm_src_hi;
1720     __m128i xmm_dst_lo, xmm_dst_hi;
1721     __m128i xmm_mask_lo, xmm_mask_hi;
1722
1723     while (w && (unsigned long)pd & 15)
1724     {
1725         s = *ps++;
1726         m = *pm++;
1727         d = *pd;
1728
1729         *pd++ = pack_1x64_32 (
1730             pix_multiply_1x64 (
1731                 unpack_32_1x64 (d),
1732                 pix_multiply_1x64 (unpack_32_1x64 (m),
1733                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1734         w--;
1735     }
1736
1737     while (w >= 4)
1738     {
1739         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1740         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1741         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1742
1743         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1744         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1745         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1746
1747         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1748                             &xmm_alpha_lo, &xmm_alpha_hi);
1749         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1750                             &xmm_alpha_lo, &xmm_alpha_hi,
1751                             &xmm_alpha_lo, &xmm_alpha_hi);
1752
1753         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1754                             &xmm_alpha_lo, &xmm_alpha_hi,
1755                             &xmm_dst_lo, &xmm_dst_hi);
1756
1757         save_128_aligned (
1758             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1759
1760         ps += 4;
1761         pd += 4;
1762         pm += 4;
1763         w -= 4;
1764     }
1765
1766     while (w)
1767     {
1768         s = *ps++;
1769         m = *pm++;
1770         d = *pd;
1771
1772         *pd++ = pack_1x64_32 (
1773             pix_multiply_1x64 (
1774                 unpack_32_1x64 (d),
1775                 pix_multiply_1x64 (unpack_32_1x64 (m),
1776                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1777         w--;
1778     }
1779 }
1780
1781 static force_inline void
1782 core_combine_out_ca_sse2 (uint32_t *      pd,
1783                           const uint32_t *ps,
1784                           const uint32_t *pm,
1785                           int             w)
1786 {
1787     uint32_t s, m, d;
1788
1789     __m128i xmm_alpha_lo, xmm_alpha_hi;
1790     __m128i xmm_src_lo, xmm_src_hi;
1791     __m128i xmm_dst_lo, xmm_dst_hi;
1792     __m128i xmm_mask_lo, xmm_mask_hi;
1793
1794     while (w && (unsigned long)pd & 15)
1795     {
1796         s = *ps++;
1797         m = *pm++;
1798         d = *pd;
1799
1800         *pd++ = pack_1x64_32 (
1801             pix_multiply_1x64 (
1802                 pix_multiply_1x64 (
1803                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1804                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
1805         w--;
1806     }
1807
1808     while (w >= 4)
1809     {
1810         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1811         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1812         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1813
1814         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1815         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1816         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1817
1818         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1819                             &xmm_alpha_lo, &xmm_alpha_hi);
1820         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1821                       &xmm_alpha_lo, &xmm_alpha_hi);
1822
1823         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1824                             &xmm_mask_lo, &xmm_mask_hi,
1825                             &xmm_dst_lo, &xmm_dst_hi);
1826         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1827                             &xmm_alpha_lo, &xmm_alpha_hi,
1828                             &xmm_dst_lo, &xmm_dst_hi);
1829
1830         save_128_aligned (
1831             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1832
1833         ps += 4;
1834         pd += 4;
1835         pm += 4;
1836         w -= 4;
1837     }
1838
1839     while (w)
1840     {
1841         s = *ps++;
1842         m = *pm++;
1843         d = *pd;
1844
1845         *pd++ = pack_1x64_32 (
1846             pix_multiply_1x64 (
1847                 pix_multiply_1x64 (
1848                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1849                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
1850
1851         w--;
1852     }
1853 }
1854
1855 static force_inline void
1856 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
1857                                   const uint32_t *ps,
1858                                   const uint32_t *pm,
1859                                   int             w)
1860 {
1861     uint32_t s, m, d;
1862
1863     __m128i xmm_alpha_lo, xmm_alpha_hi;
1864     __m128i xmm_src_lo, xmm_src_hi;
1865     __m128i xmm_dst_lo, xmm_dst_hi;
1866     __m128i xmm_mask_lo, xmm_mask_hi;
1867
1868     while (w && (unsigned long)pd & 15)
1869     {
1870         s = *ps++;
1871         m = *pm++;
1872         d = *pd;
1873
1874         *pd++ = pack_1x64_32 (
1875             pix_multiply_1x64 (
1876                 unpack_32_1x64 (d),
1877                 negate_1x64 (pix_multiply_1x64 (
1878                                  unpack_32_1x64 (m),
1879                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
1880         w--;
1881     }
1882
1883     while (w >= 4)
1884     {
1885         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1886         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1887         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1888
1889         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1890         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1891         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1892
1893         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1894                             &xmm_alpha_lo, &xmm_alpha_hi);
1895
1896         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1897                             &xmm_alpha_lo, &xmm_alpha_hi,
1898                             &xmm_mask_lo, &xmm_mask_hi);
1899
1900         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1901                       &xmm_mask_lo, &xmm_mask_hi);
1902
1903         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1904                             &xmm_mask_lo, &xmm_mask_hi,
1905                             &xmm_dst_lo, &xmm_dst_hi);
1906
1907         save_128_aligned (
1908             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1909
1910         ps += 4;
1911         pd += 4;
1912         pm += 4;
1913         w -= 4;
1914     }
1915
1916     while (w)
1917     {
1918         s = *ps++;
1919         m = *pm++;
1920         d = *pd;
1921
1922         *pd++ = pack_1x64_32 (
1923             pix_multiply_1x64 (
1924                 unpack_32_1x64 (d),
1925                 negate_1x64 (pix_multiply_1x64 (
1926                                  unpack_32_1x64 (m),
1927                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
1928         w--;
1929     }
1930 }
1931
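/* Component-alpha ATOP, per channel (roughly):
 *
 *     dest = (src * mask) * dest.a + dest * (1 - mask * src.a)
 *
 * where every product is a rounded 8-bit multiply (x * y / 255).
 */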
1932 static force_inline uint32_t
1933 core_combine_atop_ca_pixel_sse2 (uint32_t src,
1934                                  uint32_t mask,
1935                                  uint32_t dst)
1936 {
1937     __m64 m = unpack_32_1x64 (mask);
1938     __m64 s = unpack_32_1x64 (src);
1939     __m64 d = unpack_32_1x64 (dst);
1940     __m64 sa = expand_alpha_1x64 (s);
1941     __m64 da = expand_alpha_1x64 (d);
1942
1943     s = pix_multiply_1x64 (s, m);
1944     m = negate_1x64 (pix_multiply_1x64 (m, sa));
1945
1946     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
1947 }
1948
1949 static force_inline void
1950 core_combine_atop_ca_sse2 (uint32_t *      pd,
1951                            const uint32_t *ps,
1952                            const uint32_t *pm,
1953                            int             w)
1954 {
1955     uint32_t s, m, d;
1956
1957     __m128i xmm_src_lo, xmm_src_hi;
1958     __m128i xmm_dst_lo, xmm_dst_hi;
1959     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1960     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1961     __m128i xmm_mask_lo, xmm_mask_hi;
1962
1963     while (w && (unsigned long)pd & 15)
1964     {
1965         s = *ps++;
1966         m = *pm++;
1967         d = *pd;
1968
1969         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
1970         w--;
1971     }
1972
1973     while (w >= 4)
1974     {
1975         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1976         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1977         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1978
1979         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1980         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1981         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1982
1983         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1984                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1985         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1986                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1987
1988         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1989                             &xmm_mask_lo, &xmm_mask_hi,
1990                             &xmm_src_lo, &xmm_src_hi);
1991         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1992                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1993                             &xmm_mask_lo, &xmm_mask_hi);
1994
1995         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1996
1997         pix_add_multiply_2x128 (
1998             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
1999             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2000             &xmm_dst_lo, &xmm_dst_hi);
2001
2002         save_128_aligned (
2003             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2004
2005         ps += 4;
2006         pd += 4;
2007         pm += 4;
2008         w -= 4;
2009     }
2010
2011     while (w)
2012     {
2013         s = *ps++;
2014         m = *pm++;
2015         d = *pd;
2016
2017         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2018         w--;
2019     }
2020 }
2021
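/* Reverse ATOP swaps the two alpha terms, per channel (roughly):
 *
 *     dest = (src * mask) * (1 - dest.a) + dest * (mask * src.a)
 */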
2022 static force_inline uint32_t
2023 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2024                                          uint32_t mask,
2025                                          uint32_t dst)
2026 {
2027     __m64 m = unpack_32_1x64 (mask);
2028     __m64 s = unpack_32_1x64 (src);
2029     __m64 d = unpack_32_1x64 (dst);
2030
2031     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2032     __m64 sa = expand_alpha_1x64 (s);
2033
2034     s = pix_multiply_1x64 (s, m);
2035     m = pix_multiply_1x64 (m, sa);
2036
2037     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2038 }
2039
2040 static force_inline void
2041 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2042                                    const uint32_t *ps,
2043                                    const uint32_t *pm,
2044                                    int             w)
2045 {
2046     uint32_t s, m, d;
2047
2048     __m128i xmm_src_lo, xmm_src_hi;
2049     __m128i xmm_dst_lo, xmm_dst_hi;
2050     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2051     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2052     __m128i xmm_mask_lo, xmm_mask_hi;
2053
2054     while (w && (unsigned long)pd & 15)
2055     {
2056         s = *ps++;
2057         m = *pm++;
2058         d = *pd;
2059
2060         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2061         w--;
2062     }
2063
2064     while (w >= 4)
2065     {
2066         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2067         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2068         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2069
2070         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2071         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2072         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2073
2074         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2075                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2076         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2077                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2078
2079         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2080                             &xmm_mask_lo, &xmm_mask_hi,
2081                             &xmm_src_lo, &xmm_src_hi);
2082         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2083                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2084                             &xmm_mask_lo, &xmm_mask_hi);
2085
2086         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2087                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2088
2089         pix_add_multiply_2x128 (
2090             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092             &xmm_dst_lo, &xmm_dst_hi);
2093
2094         save_128_aligned (
2095             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096
2097         ps += 4;
2098         pd += 4;
2099         pm += 4;
2100         w -= 4;
2101     }
2102
2103     while (w)
2104     {
2105         s = *ps++;
2106         m = *pm++;
2107         d = *pd;
2108
2109         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2110         w--;
2111     }
2112 }
2113
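/* Component-alpha XOR keeps only the non-overlapping parts, per channel
 * (roughly):
 *
 *     dest = (src * mask) * (1 - dest.a) + dest * (1 - mask * src.a)
 */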
2114 static force_inline uint32_t
2115 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2116                                 uint32_t mask,
2117                                 uint32_t dst)
2118 {
2119     __m64 a = unpack_32_1x64 (mask);
2120     __m64 s = unpack_32_1x64 (src);
2121     __m64 d = unpack_32_1x64 (dst);
2122
2123     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2124                                        a, expand_alpha_1x64 (s)));
2125     __m64 dest      = pix_multiply_1x64 (s, a);
2126     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2127
2128     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2129                                                 &alpha_dst,
2130                                                 &dest,
2131                                                 &alpha_src));
2132 }
2133
2134 static force_inline void
2135 core_combine_xor_ca_sse2 (uint32_t *      pd,
2136                           const uint32_t *ps,
2137                           const uint32_t *pm,
2138                           int             w)
2139 {
2140     uint32_t s, m, d;
2141
2142     __m128i xmm_src_lo, xmm_src_hi;
2143     __m128i xmm_dst_lo, xmm_dst_hi;
2144     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2145     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2146     __m128i xmm_mask_lo, xmm_mask_hi;
2147
2148     while (w && (unsigned long)pd & 15)
2149     {
2150         s = *ps++;
2151         m = *pm++;
2152         d = *pd;
2153
2154         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2155         w--;
2156     }
2157
2158     while (w >= 4)
2159     {
2160         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163
2164         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167
2168         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2170         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2171                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172
2173         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2174                             &xmm_mask_lo, &xmm_mask_hi,
2175                             &xmm_src_lo, &xmm_src_hi);
2176         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2177                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2178                             &xmm_mask_lo, &xmm_mask_hi);
2179
2180         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2181                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2183                       &xmm_mask_lo, &xmm_mask_hi);
2184
2185         pix_add_multiply_2x128 (
2186             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2187             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2188             &xmm_dst_lo, &xmm_dst_hi);
2189
2190         save_128_aligned (
2191             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2192
2193         ps += 4;
2194         pd += 4;
2195         pm += 4;
2196         w -= 4;
2197     }
2198
2199     while (w)
2200     {
2201         s = *ps++;
2202         m = *pm++;
2203         d = *pd;
2204
2205         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2206         w--;
2207     }
2208 }
2209
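/* Component-alpha ADD is a saturating per-channel sum, roughly
 * dest = clamp (src * mask + dest), done with _mm_adds_epu8 in the
 * vector loop and _mm_adds_pu8 for the leading and trailing pixels.
 */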
2210 static force_inline void
2211 core_combine_add_ca_sse2 (uint32_t *      pd,
2212                           const uint32_t *ps,
2213                           const uint32_t *pm,
2214                           int             w)
2215 {
2216     uint32_t s, m, d;
2217
2218     __m128i xmm_src_lo, xmm_src_hi;
2219     __m128i xmm_dst_lo, xmm_dst_hi;
2220     __m128i xmm_mask_lo, xmm_mask_hi;
2221
2222     while (w && (unsigned long)pd & 15)
2223     {
2224         s = *ps++;
2225         m = *pm++;
2226         d = *pd;
2227
2228         *pd++ = pack_1x64_32 (
2229             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2230                                              unpack_32_1x64 (m)),
2231                           unpack_32_1x64 (d)));
2232         w--;
2233     }
2234
2235     while (w >= 4)
2236     {
2237         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2238         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2239         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2240
2241         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2242         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2243         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2244
2245         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2246                             &xmm_mask_lo, &xmm_mask_hi,
2247                             &xmm_src_lo, &xmm_src_hi);
2248
2249         save_128_aligned (
2250             (__m128i*)pd, pack_2x128_128 (
2251                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2252                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2253
2254         ps += 4;
2255         pd += 4;
2256         pm += 4;
2257         w -= 4;
2258     }
2259
2260     while (w)
2261     {
2262         s = *ps++;
2263         m = *pm++;
2264         d = *pd;
2265
2266         *pd++ = pack_1x64_32 (
2267             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2268                                              unpack_32_1x64 (m)),
2269                           unpack_32_1x64 (d)));
2270         w--;
2271     }
2272 }
2273
2274 /* ---------------------------------------------------
2275  * fb_compose_setup_SSE2

2276  */
2277 static force_inline __m64
2278 create_mask_16_64 (uint16_t mask)
2279 {
2280     return _mm_set1_pi16 (mask);
2281 }
2282
2283 static force_inline __m128i
2284 create_mask_16_128 (uint16_t mask)
2285 {
2286     return _mm_set1_epi16 (mask);
2287 }
2288
2289 static force_inline __m64
2290 create_mask_2x32_64 (uint32_t mask0,
2291                      uint32_t mask1)
2292 {
2293     return _mm_set_pi32 (mask0, mask1);
2294 }
2295
2296 /* Work around a code generation bug in Sun Studio 12. */
2297 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2298 # define create_mask_2x32_128(mask0, mask1)                             \
2299     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2300 #else
2301 static force_inline __m128i
2302 create_mask_2x32_128 (uint32_t mask0,
2303                       uint32_t mask1)
2304 {
2305     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2306 }
2307 #endif
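/* For illustration only (the real constant setup presumably lives in the
 * implementation's init code, not in this excerpt): these helpers broadcast
 * constants such as
 *
 *     create_mask_2x32_128 (0xff000000, 0xff000000)   - opaque alpha bytes
 *     create_mask_16_128 (mask >> 24)                 - one alpha in all lanes
 */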
2308
2309 /* SSE2 code patch for fbcompose.c */
2310
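/* Each wrapper below forwards to the matching core_combine_*_sse2 worker
 * and then calls _mm_empty () so the MMX state used by the 1x64 helpers
 * is cleared before control returns to code that may use x87 floating
 * point.
 */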
2311 static void
2312 sse2_combine_over_u (pixman_implementation_t *imp,
2313                      pixman_op_t              op,
2314                      uint32_t *               dst,
2315                      const uint32_t *         src,
2316                      const uint32_t *         mask,
2317                      int                      width)
2318 {
2319     core_combine_over_u_sse2 (dst, src, mask, width);
2320     _mm_empty ();
2321 }
2322
2323 static void
2324 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2325                              pixman_op_t              op,
2326                              uint32_t *               dst,
2327                              const uint32_t *         src,
2328                              const uint32_t *         mask,
2329                              int                      width)
2330 {
2331     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2332     _mm_empty ();
2333 }
2334
2335 static void
2336 sse2_combine_in_u (pixman_implementation_t *imp,
2337                    pixman_op_t              op,
2338                    uint32_t *               dst,
2339                    const uint32_t *         src,
2340                    const uint32_t *         mask,
2341                    int                      width)
2342 {
2343     core_combine_in_u_sse2 (dst, src, mask, width);
2344     _mm_empty ();
2345 }
2346
2347 static void
2348 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2349                            pixman_op_t              op,
2350                            uint32_t *               dst,
2351                            const uint32_t *         src,
2352                            const uint32_t *         mask,
2353                            int                      width)
2354 {
2355     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2356     _mm_empty ();
2357 }
2358
2359 static void
2360 sse2_combine_out_u (pixman_implementation_t *imp,
2361                     pixman_op_t              op,
2362                     uint32_t *               dst,
2363                     const uint32_t *         src,
2364                     const uint32_t *         mask,
2365                     int                      width)
2366 {
2367     core_combine_out_u_sse2 (dst, src, mask, width);
2368     _mm_empty ();
2369 }
2370
2371 static void
2372 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2373                             pixman_op_t              op,
2374                             uint32_t *               dst,
2375                             const uint32_t *         src,
2376                             const uint32_t *         mask,
2377                             int                      width)
2378 {
2379     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2380     _mm_empty ();
2381 }
2382
2383 static void
2384 sse2_combine_atop_u (pixman_implementation_t *imp,
2385                      pixman_op_t              op,
2386                      uint32_t *               dst,
2387                      const uint32_t *         src,
2388                      const uint32_t *         mask,
2389                      int                      width)
2390 {
2391     core_combine_atop_u_sse2 (dst, src, mask, width);
2392     _mm_empty ();
2393 }
2394
2395 static void
2396 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2397                              pixman_op_t              op,
2398                              uint32_t *               dst,
2399                              const uint32_t *         src,
2400                              const uint32_t *         mask,
2401                              int                      width)
2402 {
2403     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2404     _mm_empty ();
2405 }
2406
2407 static void
2408 sse2_combine_xor_u (pixman_implementation_t *imp,
2409                     pixman_op_t              op,
2410                     uint32_t *               dst,
2411                     const uint32_t *         src,
2412                     const uint32_t *         mask,
2413                     int                      width)
2414 {
2415     core_combine_xor_u_sse2 (dst, src, mask, width);
2416     _mm_empty ();
2417 }
2418
2419 static void
2420 sse2_combine_add_u (pixman_implementation_t *imp,
2421                     pixman_op_t              op,
2422                     uint32_t *               dst,
2423                     const uint32_t *         src,
2424                     const uint32_t *         mask,
2425                     int                      width)
2426 {
2427     core_combine_add_u_sse2 (dst, src, mask, width);
2428     _mm_empty ();
2429 }
2430
2431 static void
2432 sse2_combine_saturate_u (pixman_implementation_t *imp,
2433                          pixman_op_t              op,
2434                          uint32_t *               dst,
2435                          const uint32_t *         src,
2436                          const uint32_t *         mask,
2437                          int                      width)
2438 {
2439     core_combine_saturate_u_sse2 (dst, src, mask, width);
2440     _mm_empty ();
2441 }
2442
2443 static void
2444 sse2_combine_src_ca (pixman_implementation_t *imp,
2445                      pixman_op_t              op,
2446                      uint32_t *               dst,
2447                      const uint32_t *         src,
2448                      const uint32_t *         mask,
2449                      int                      width)
2450 {
2451     core_combine_src_ca_sse2 (dst, src, mask, width);
2452     _mm_empty ();
2453 }
2454
2455 static void
2456 sse2_combine_over_ca (pixman_implementation_t *imp,
2457                       pixman_op_t              op,
2458                       uint32_t *               dst,
2459                       const uint32_t *         src,
2460                       const uint32_t *         mask,
2461                       int                      width)
2462 {
2463     core_combine_over_ca_sse2 (dst, src, mask, width);
2464     _mm_empty ();
2465 }
2466
2467 static void
2468 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2469                               pixman_op_t              op,
2470                               uint32_t *               dst,
2471                               const uint32_t *         src,
2472                               const uint32_t *         mask,
2473                               int                      width)
2474 {
2475     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2476     _mm_empty ();
2477 }
2478
2479 static void
2480 sse2_combine_in_ca (pixman_implementation_t *imp,
2481                     pixman_op_t              op,
2482                     uint32_t *               dst,
2483                     const uint32_t *         src,
2484                     const uint32_t *         mask,
2485                     int                      width)
2486 {
2487     core_combine_in_ca_sse2 (dst, src, mask, width);
2488     _mm_empty ();
2489 }
2490
2491 static void
2492 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2493                             pixman_op_t              op,
2494                             uint32_t *               dst,
2495                             const uint32_t *         src,
2496                             const uint32_t *         mask,
2497                             int                      width)
2498 {
2499     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2500     _mm_empty ();
2501 }
2502
2503 static void
2504 sse2_combine_out_ca (pixman_implementation_t *imp,
2505                      pixman_op_t              op,
2506                      uint32_t *               dst,
2507                      const uint32_t *         src,
2508                      const uint32_t *         mask,
2509                      int                      width)
2510 {
2511     core_combine_out_ca_sse2 (dst, src, mask, width);
2512     _mm_empty ();
2513 }
2514
2515 static void
2516 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2517                              pixman_op_t              op,
2518                              uint32_t *               dst,
2519                              const uint32_t *         src,
2520                              const uint32_t *         mask,
2521                              int                      width)
2522 {
2523     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2524     _mm_empty ();
2525 }
2526
2527 static void
2528 sse2_combine_atop_ca (pixman_implementation_t *imp,
2529                       pixman_op_t              op,
2530                       uint32_t *               dst,
2531                       const uint32_t *         src,
2532                       const uint32_t *         mask,
2533                       int                      width)
2534 {
2535     core_combine_atop_ca_sse2 (dst, src, mask, width);
2536     _mm_empty ();
2537 }
2538
2539 static void
2540 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2541                               pixman_op_t              op,
2542                               uint32_t *               dst,
2543                               const uint32_t *         src,
2544                               const uint32_t *         mask,
2545                               int                      width)
2546 {
2547     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2548     _mm_empty ();
2549 }
2550
2551 static void
2552 sse2_combine_xor_ca (pixman_implementation_t *imp,
2553                      pixman_op_t              op,
2554                      uint32_t *               dst,
2555                      const uint32_t *         src,
2556                      const uint32_t *         mask,
2557                      int                      width)
2558 {
2559     core_combine_xor_ca_sse2 (dst, src, mask, width);
2560     _mm_empty ();
2561 }
2562
2563 static void
2564 sse2_combine_add_ca (pixman_implementation_t *imp,
2565                      pixman_op_t              op,
2566                      uint32_t *               dst,
2567                      const uint32_t *         src,
2568                      const uint32_t *         mask,
2569                      int                      width)
2570 {
2571     core_combine_add_ca_sse2 (dst, src, mask, width);
2572     _mm_empty ();
2573 }
2574
2575 /* -------------------------------------------------------------------
2576  * composite_over_n_8888
2577  */
2578
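/* With a solid source, OVER reduces per pixel to
 *
 *     dest = src + dest * (1 - src.a)
 *
 * so the source and its expanded alpha are computed once, before the
 * scanline loops, and reused for every destination pixel.
 */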
2579 static void
2580 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2581                             pixman_op_t              op,
2582                             pixman_image_t *         src_image,
2583                             pixman_image_t *         mask_image,
2584                             pixman_image_t *         dst_image,
2585                             int32_t                  src_x,
2586                             int32_t                  src_y,
2587                             int32_t                  mask_x,
2588                             int32_t                  mask_y,
2589                             int32_t                  dest_x,
2590                             int32_t                  dest_y,
2591                             int32_t                  width,
2592                             int32_t                  height)
2593 {
2594     uint32_t src;
2595     uint32_t    *dst_line, *dst, d;
2596     int32_t w;
2597     int dst_stride;
2598     __m128i xmm_src, xmm_alpha;
2599     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2600
2601     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2602
2603     if (src == 0)
2604         return;
2605
2606     PIXMAN_IMAGE_GET_LINE (
2607         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2608
2609     xmm_src = expand_pixel_32_1x128 (src);
2610     xmm_alpha = expand_alpha_1x128 (xmm_src);
2611
2612     while (height--)
2613     {
2614         dst = dst_line;
2615
2616         dst_line += dst_stride;
2617         w = width;
2618
2619         while (w && (unsigned long)dst & 15)
2620         {
2621             d = *dst;
2622             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2623                                               _mm_movepi64_pi64 (xmm_alpha),
2624                                               unpack_32_1x64 (d)));
2625             w--;
2626         }
2627
2628         while (w >= 4)
2629         {
2630             xmm_dst = load_128_aligned ((__m128i*)dst);
2631
2632             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2633
2634             over_2x128 (&xmm_src, &xmm_src,
2635                         &xmm_alpha, &xmm_alpha,
2636                         &xmm_dst_lo, &xmm_dst_hi);
2637
2638             /* rebuild the 4 pixel data and save */
2639             save_128_aligned (
2640                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2641
2642             w -= 4;
2643             dst += 4;
2644         }
2645
2646         while (w)
2647         {
2648             d = *dst;
2649             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2650                                               _mm_movepi64_pi64 (xmm_alpha),
2651                                               unpack_32_1x64 (d)));
2652             w--;
2653         }
2654
2655     }
2656     _mm_empty ();
2657 }
2658
2659 /* ---------------------------------------------------------------------
2660  * composite_over_n_0565
2661  */
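/* Same solid-source OVER, but the destination is r5g6b5: the vector loop
 * handles 8 pixels per 128-bit load, expanding them to four 8888 halves
 * with unpack_565_128_4x128, blending, and repacking with
 * pack_565_4x128_128 before the aligned store.
 */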
2662 static void
2663 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2664                             pixman_op_t              op,
2665                             pixman_image_t *         src_image,
2666                             pixman_image_t *         mask_image,
2667                             pixman_image_t *         dst_image,
2668                             int32_t                  src_x,
2669                             int32_t                  src_y,
2670                             int32_t                  mask_x,
2671                             int32_t                  mask_y,
2672                             int32_t                  dest_x,
2673                             int32_t                  dest_y,
2674                             int32_t                  width,
2675                             int32_t                  height)
2676 {
2677     uint32_t src;
2678     uint16_t    *dst_line, *dst, d;
2679     int32_t w;
2680     int dst_stride;
2681     __m128i xmm_src, xmm_alpha;
2682     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2683
2684     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2685
2686     if (src == 0)
2687         return;
2688
2689     PIXMAN_IMAGE_GET_LINE (
2690         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2691
2692     xmm_src = expand_pixel_32_1x128 (src);
2693     xmm_alpha = expand_alpha_1x128 (xmm_src);
2694
2695     while (height--)
2696     {
2697         dst = dst_line;
2698
2699         dst_line += dst_stride;
2700         w = width;
2701
2702         while (w && (unsigned long)dst & 15)
2703         {
2704             d = *dst;
2705
2706             *dst++ = pack_565_32_16 (
2707                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2708                                          _mm_movepi64_pi64 (xmm_alpha),
2709                                          expand565_16_1x64 (d))));
2710             w--;
2711         }
2712
2713         while (w >= 8)
2714         {
2715             xmm_dst = load_128_aligned ((__m128i*)dst);
2716
2717             unpack_565_128_4x128 (xmm_dst,
2718                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2719
2720             over_2x128 (&xmm_src, &xmm_src,
2721                         &xmm_alpha, &xmm_alpha,
2722                         &xmm_dst0, &xmm_dst1);
2723             over_2x128 (&xmm_src, &xmm_src,
2724                         &xmm_alpha, &xmm_alpha,
2725                         &xmm_dst2, &xmm_dst3);
2726
2727             xmm_dst = pack_565_4x128_128 (
2728                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2729
2730             save_128_aligned ((__m128i*)dst, xmm_dst);
2731
2732             dst += 8;
2733             w -= 8;
2734         }
2735
2736         while (w--)
2737         {
2738             d = *dst;
2739             *dst++ = pack_565_32_16 (
2740                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2741                                          _mm_movepi64_pi64 (xmm_alpha),
2742                                          expand565_16_1x64 (d))));
2743         }
2744     }
2745
2746     _mm_empty ();
2747 }
2748
2749 /* ------------------------------
2750  * composite_add_n_8888_8888_ca
2751  */
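/* Component-alpha ADD with a solid source: dest = clamp (mask * src + dest).
 * The 4-pixel loop tests the loaded mask with _mm_cmpeq_epi32 and
 * _mm_movemask_epi8 and skips the read-modify-write when all four mask
 * pixels are zero.
 */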
2752 static void
2753 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2754                                    pixman_op_t              op,
2755                                    pixman_image_t *         src_image,
2756                                    pixman_image_t *         mask_image,
2757                                    pixman_image_t *         dst_image,
2758                                    int32_t                  src_x,
2759                                    int32_t                  src_y,
2760                                    int32_t                  mask_x,
2761                                    int32_t                  mask_y,
2762                                    int32_t                  dest_x,
2763                                    int32_t                  dest_y,
2764                                    int32_t                  width,
2765                                    int32_t                  height)
2766 {
2767     uint32_t src, srca;
2768     uint32_t    *dst_line, d;
2769     uint32_t    *mask_line, m;
2770     uint32_t pack_cmp;
2771     int dst_stride, mask_stride;
2772
2773     __m128i xmm_src, xmm_alpha;
2774     __m128i xmm_dst;
2775     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2776
2777     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2778
2779     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2780     srca = src >> 24;
2781
2782     if (src == 0)
2783         return;
2784
2785     PIXMAN_IMAGE_GET_LINE (
2786         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2787     PIXMAN_IMAGE_GET_LINE (
2788         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2789
2790     xmm_src = _mm_unpacklo_epi8 (
2791         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2792     xmm_alpha = expand_alpha_1x128 (xmm_src);
2793     mmx_src   = _mm_movepi64_pi64 (xmm_src);
2794     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2795
2796     while (height--)
2797     {
2798         int w = width;
2799         const uint32_t *pm = (uint32_t *)mask_line;
2800         uint32_t *pd = (uint32_t *)dst_line;
2801
2802         dst_line += dst_stride;
2803         mask_line += mask_stride;
2804
2805         while (w && (unsigned long)pd & 15)
2806         {
2807             m = *pm++;
2808
2809             if (m)
2810             {
2811                 d = *pd;
2812
2813                 mmx_mask = unpack_32_1x64 (m);
2814                 mmx_dest = unpack_32_1x64 (d);
2815
2816                 *pd = pack_1x64_32 (
2817                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2818             }
2819
2820             pd++;
2821             w--;
2822         }
2823
2824         while (w >= 4)
2825         {
2826             xmm_mask = load_128_unaligned ((__m128i*)pm);
2827
2828             pack_cmp =
2829                 _mm_movemask_epi8 (
2830                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2831
2832             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2833             if (pack_cmp != 0xffff)
2834             {
2835                 xmm_dst = load_128_aligned ((__m128i*)pd);
2836
2837                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2838
2839                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2840                                     &xmm_mask_lo, &xmm_mask_hi,
2841                                     &xmm_mask_lo, &xmm_mask_hi);
2842                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2843
2844                 save_128_aligned (
2845                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2846             }
2847
2848             pd += 4;
2849             pm += 4;
2850             w -= 4;
2851         }
2852
2853         while (w)
2854         {
2855             m = *pm++;
2856
2857             if (m)
2858             {
2859                 d = *pd;
2860
2861                 mmx_mask = unpack_32_1x64 (m);
2862                 mmx_dest = unpack_32_1x64 (d);
2863
2864                 *pd = pack_1x64_32 (
2865                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2866             }
2867
2868             pd++;
2869             w--;
2870         }
2871     }
2872
2873     _mm_empty ();
2874 }
2875
2876 /* ---------------------------------------------------------------------------
2877  * composite_over_n_8888_8888_ca
2878  */
2879
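/* Component-alpha OVER with a solid source, per channel (roughly):
 *
 *     dest = src * mask + dest * (1 - src.a * mask)
 *
 * i.e. in_over with the solid source and its alpha hoisted out of the
 * loops; zero masks are skipped with the same movemask test as the ADD
 * path above.
 */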
2880 static void
2881 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2882                                     pixman_op_t              op,
2883                                     pixman_image_t *         src_image,
2884                                     pixman_image_t *         mask_image,
2885                                     pixman_image_t *         dst_image,
2886                                     int32_t                  src_x,
2887                                     int32_t                  src_y,
2888                                     int32_t                  mask_x,
2889                                     int32_t                  mask_y,
2890                                     int32_t                  dest_x,
2891                                     int32_t                  dest_y,
2892                                     int32_t                  width,
2893                                     int32_t                  height)
2894 {
2895     uint32_t src;
2896     uint32_t    *dst_line, d;
2897     uint32_t    *mask_line, m;
2898     uint32_t pack_cmp;
2899     int dst_stride, mask_stride;
2900
2901     __m128i xmm_src, xmm_alpha;
2902     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2903     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2904
2905     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2906
2907     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2908
2909     if (src == 0)
2910         return;
2911
2912     PIXMAN_IMAGE_GET_LINE (
2913         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2914     PIXMAN_IMAGE_GET_LINE (
2915         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2916
2917     xmm_src = _mm_unpacklo_epi8 (
2918         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2919     xmm_alpha = expand_alpha_1x128 (xmm_src);
2920     mmx_src   = _mm_movepi64_pi64 (xmm_src);
2921     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2922
2923     while (height--)
2924     {
2925         int w = width;
2926         const uint32_t *pm = (uint32_t *)mask_line;
2927         uint32_t *pd = (uint32_t *)dst_line;
2928
2929         dst_line += dst_stride;
2930         mask_line += mask_stride;
2931
2932         while (w && (unsigned long)pd & 15)
2933         {
2934             m = *pm++;
2935
2936             if (m)
2937             {
2938                 d = *pd;
2939                 mmx_mask = unpack_32_1x64 (m);
2940                 mmx_dest = unpack_32_1x64 (d);
2941
2942                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
2943                                                   &mmx_alpha,
2944                                                   &mmx_mask,
2945                                                   &mmx_dest));
2946             }
2947
2948             pd++;
2949             w--;
2950         }
2951
2952         while (w >= 4)
2953         {
2954             xmm_mask = load_128_unaligned ((__m128i*)pm);
2955
2956             pack_cmp =
2957                 _mm_movemask_epi8 (
2958                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2959
2960             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2961             if (pack_cmp != 0xffff)
2962             {
2963                 xmm_dst = load_128_aligned ((__m128i*)pd);
2964
2965                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2966                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2967
2968                 in_over_2x128 (&xmm_src, &xmm_src,
2969                                &xmm_alpha, &xmm_alpha,
2970                                &xmm_mask_lo, &xmm_mask_hi,
2971                                &xmm_dst_lo, &xmm_dst_hi);
2972
2973                 save_128_aligned (
2974                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2975             }
2976
2977             pd += 4;
2978             pm += 4;
2979             w -= 4;
2980         }
2981
2982         while (w)
2983         {
2984             m = *pm++;
2985
2986             if (m)
2987             {
2988                 d = *pd;
2989                 mmx_mask = unpack_32_1x64 (m);
2990                 mmx_dest = unpack_32_1x64 (d);
2991
2992                 *pd = pack_1x64_32 (
2993                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2994             }
2995
2996             pd++;
2997             w--;
2998         }
2999     }
3000
3001     _mm_empty ();
3002 }
3003
3004 /*---------------------------------------------------------------------
3005  * composite_over_8888_n_8888
3006  */
3007
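/* OVER of an a8r8g8b8 source under a constant mask.  Only the mask's
 * alpha byte matters: it is broadcast with create_mask_16_128 (mask >> 24)
 * and passed to in_over_* as the mask term, giving (roughly)
 * dest = src * m + dest * (1 - src.a * m) per channel.
 */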
3008 static void
3009 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3010                                  pixman_op_t              op,
3011                                  pixman_image_t *         src_image,
3012                                  pixman_image_t *         mask_image,
3013                                  pixman_image_t *         dst_image,
3014                                  int32_t                  src_x,
3015                                  int32_t                  src_y,
3016                                  int32_t                  mask_x,
3017                                  int32_t                  mask_y,
3018                                  int32_t                  dest_x,
3019                                  int32_t                  dest_y,
3020                                  int32_t                  width,
3021                                  int32_t                  height)
3022 {
3023     uint32_t    *dst_line, *dst;
3024     uint32_t    *src_line, *src;
3025     uint32_t mask;
3026     int32_t w;
3027     int dst_stride, src_stride;
3028
3029     __m128i xmm_mask;
3030     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3031     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3032     __m128i xmm_alpha_lo, xmm_alpha_hi;
3033
3034     PIXMAN_IMAGE_GET_LINE (
3035         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3036     PIXMAN_IMAGE_GET_LINE (
3037         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3038
3039     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3040
3041     xmm_mask = create_mask_16_128 (mask >> 24);
3042
3043     while (height--)
3044     {
3045         dst = dst_line;
3046         dst_line += dst_stride;
3047         src = src_line;
3048         src_line += src_stride;
3049         w = width;
3050
3051         while (w && (unsigned long)dst & 15)
3052         {
3053             uint32_t s = *src++;
3054             uint32_t d = *dst;
3055
3056             __m64 ms = unpack_32_1x64 (s);
3057             __m64 alpha    = expand_alpha_1x64 (ms);
3058             __m64 dest     = _mm_movepi64_pi64 (xmm_mask);
3059             __m64 alpha_dst = unpack_32_1x64 (d);
3060
3061             *dst++ = pack_1x64_32 (
3062                 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
3063
3064             w--;
3065         }
3066
3067         while (w >= 4)
3068         {
3069             xmm_src = load_128_unaligned ((__m128i*)src);
3070             xmm_dst = load_128_aligned ((__m128i*)dst);
3071
3072             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3073             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3074             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3075                                 &xmm_alpha_lo, &xmm_alpha_hi);
3076
3077             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3078                            &xmm_alpha_lo, &xmm_alpha_hi,
3079                            &xmm_mask, &xmm_mask,
3080                            &xmm_dst_lo, &xmm_dst_hi);
3081
3082             save_128_aligned (
3083                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3084
3085             dst += 4;
3086             src += 4;
3087             w -= 4;
3088         }
3089
3090         while (w)
3091         {
3092             uint32_t s = *src++;
3093             uint32_t d = *dst;
3094
3095             __m64 ms = unpack_32_1x64 (s);
3096             __m64 alpha = expand_alpha_1x64 (ms);
3097             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3098             __m64 dest  = unpack_32_1x64 (d);
3099
3100             *dst++ = pack_1x64_32 (
3101                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3102
3103             w--;
3104         }
3105     }
3106
3107     _mm_empty ();
3108 }
3109
3110 /*---------------------------------------------------------------------
3111  * composite_src_x888_8888
3112  */
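/* SRC from x8r8g8b8 to a8r8g8b8 only needs to force the alpha byte to
 * 0xff, so each pixel is OR'ed with 0xff000000 (mask_ff000000 in the
 * 16-pixel vector loop).
 */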
3113
3114 static void
3115 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
3116                               pixman_op_t              op,
3117                               pixman_image_t *         src_image,
3118                               pixman_image_t *         mask_image,
3119                               pixman_image_t *         dst_image,
3120                               int32_t                  src_x,
3121                               int32_t                  src_y,
3122                               int32_t                  mask_x,
3123                               int32_t                  mask_y,
3124                               int32_t                  dest_x,
3125                               int32_t                  dest_y,
3126                               int32_t                  width,
3127                               int32_t                  height)
3128 {
3129     uint32_t    *dst_line, *dst;
3130     uint32_t    *src_line, *src;
3131     int32_t w;
3132     int dst_stride, src_stride;
3133
3134
3135     PIXMAN_IMAGE_GET_LINE (
3136         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3137     PIXMAN_IMAGE_GET_LINE (
3138         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3139
3140     while (height--)
3141     {
3142         dst = dst_line;
3143         dst_line += dst_stride;
3144         src = src_line;
3145         src_line += src_stride;
3146         w = width;
3147
3148         while (w && (unsigned long)dst & 15)
3149         {
3150             *dst++ = *src++ | 0xff000000;
3151             w--;
3152         }
3153
3154         while (w >= 16)
3155         {
3156             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
3157             
3158             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
3159             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
3160             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
3161             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
3162             
3163             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
3164             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
3165             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
3166             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
3167             
3168             dst += 16;
3169             src += 16;
3170             w -= 16;
3171         }
3172
3173         while (w)
3174         {
3175             *dst++ = *src++ | 0xff000000;
3176             w--;
3177         }
3178     }
3179
3180     _mm_empty ();
3181 }
3182
3183 /* ---------------------------------------------------------------------
3184  * composite_over_x888_n_8888
3185  */
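/* Like composite_over_8888_n_8888 above, but the source has no alpha
 * channel: every source pixel is OR'ed with 0xff000000 and the expanded
 * source alpha collapses to the constant mask_00ff.
 */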
3186 static void
3187 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3188                                  pixman_op_t              op,
3189                                  pixman_image_t *         src_image,
3190                                  pixman_image_t *         mask_image,
3191                                  pixman_image_t *         dst_image,
3192                                  int32_t                  src_x,
3193                                  int32_t                  src_y,
3194                                  int32_t                  mask_x,
3195                                  int32_t                  mask_y,
3196                                  int32_t                  dest_x,
3197                                  int32_t                  dest_y,
3198                                  int32_t                  width,
3199                                  int32_t                  height)
3200 {
3201     uint32_t    *dst_line, *dst;
3202     uint32_t    *src_line, *src;
3203     uint32_t mask;
3204     int dst_stride, src_stride;
3205     int32_t w;
3206
3207     __m128i xmm_mask, xmm_alpha;
3208     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3209     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3210
3211     PIXMAN_IMAGE_GET_LINE (
3212         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3213     PIXMAN_IMAGE_GET_LINE (
3214         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3215
3216     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3217
3218     xmm_mask = create_mask_16_128 (mask >> 24);
3219     xmm_alpha = mask_00ff;
3220
3221     while (height--)
3222     {
3223         dst = dst_line;
3224         dst_line += dst_stride;
3225         src = src_line;
3226         src_line += src_stride;
3227         w = width;
3228
3229         while (w && (unsigned long)dst & 15)
3230         {
3231             uint32_t s = (*src++) | 0xff000000;
3232             uint32_t d = *dst;
3233
3234             __m64 src   = unpack_32_1x64 (s);
3235             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3236             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3237             __m64 dest  = unpack_32_1x64 (d);
3238
3239             *dst++ = pack_1x64_32 (
3240                 in_over_1x64 (&src, &alpha, &mask, &dest));
3241
3242             w--;
3243         }
3244
3245         while (w >= 4)
3246         {
3247             xmm_src = _mm_or_si128 (
3248                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3249             xmm_dst = load_128_aligned ((__m128i*)dst);
3250
3251             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3252             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3253
3254             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3255                            &xmm_alpha, &xmm_alpha,
3256                            &xmm_mask, &xmm_mask,
3257                            &xmm_dst_lo, &xmm_dst_hi);
3258
3259             save_128_aligned (
3260                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3261
3262             dst += 4;
3263             src += 4;
3264             w -= 4;
3265
3266         }
3267
3268         while (w)
3269         {
3270             uint32_t s = (*src++) | 0xff000000;
3271             uint32_t d = *dst;
3272
3273             __m64 src  = unpack_32_1x64 (s);
3274             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3275             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3276             __m64 dest  = unpack_32_1x64 (d);
3277
3278             *dst++ = pack_1x64_32 (
3279                 in_over_1x64 (&src, &alpha, &mask, &dest));
3280
3281             w--;
3282         }
3283     }
3284
3285     _mm_empty ();
3286 }
3287
3288 /* --------------------------------------------------------------------
3289  * composite_over_8888_8888
3290  */
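/* The unmasked 8888 OVER simply hands each scanline to
 * core_combine_over_u_sse2 with a NULL mask.
 */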
3291 static void
3292 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3293                                pixman_op_t              op,
3294                                pixman_image_t *         src_image,
3295                                pixman_image_t *         mask_image,
3296                                pixman_image_t *         dst_image,
3297                                int32_t                  src_x,
3298                                int32_t                  src_y,
3299                                int32_t                  mask_x,
3300                                int32_t                  mask_y,
3301                                int32_t                  dest_x,
3302                                int32_t                  dest_y,
3303                                int32_t                  width,
3304                                int32_t                  height)
3305 {
3306     int dst_stride, src_stride;
3307     uint32_t    *dst_line, *dst;
3308     uint32_t    *src_line, *src;
3309
3310     PIXMAN_IMAGE_GET_LINE (
3311         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3312     PIXMAN_IMAGE_GET_LINE (
3313         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3314
3315     dst = dst_line;
3316     src = src_line;
3317
3318     while (height--)
3319     {
3320         core_combine_over_u_sse2 (dst, src, NULL, width);
3321
3322         dst += dst_stride;
3323         src += src_stride;
3324     }
3325     _mm_empty ();
3326 }
3327
3328 /* ------------------------------------------------------------------
3329  * composite_over_8888_0565
3330  */
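/* OVER onto r5g6b5: the helper below blends one pixel by expanding the
 * 565 destination to 8888, applying OVER, and repacking; the vector loop
 * does the same for 8 destination pixels per aligned load using two
 * over_2x128 passes.
 */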
3331 static force_inline uint16_t
3332 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3333 {
3334     __m64 ms;
3335
3336     ms = unpack_32_1x64 (src);
3337     return pack_565_32_16 (
3338         pack_1x64_32 (
3339             over_1x64 (
3340                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3341 }
3342
3343 static void
3344 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3345                                pixman_op_t              op,
3346                                pixman_image_t *         src_image,
3347                                pixman_image_t *         mask_image,
3348                                pixman_image_t *         dst_image,
3349                                int32_t                  src_x,
3350                                int32_t                  src_y,
3351                                int32_t                  mask_x,
3352                                int32_t                  mask_y,
3353                                int32_t                  dest_x,
3354                                int32_t                  dest_y,
3355                                int32_t                  width,
3356                                int32_t                  height)
3357 {
3358     uint16_t    *dst_line, *dst, d;
3359     uint32_t    *src_line, *src, s;
3360     int dst_stride, src_stride;
3361     int32_t w;
3362
3363     __m128i xmm_alpha_lo, xmm_alpha_hi;
3364     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3365     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3366
3367     PIXMAN_IMAGE_GET_LINE (
3368         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3369     PIXMAN_IMAGE_GET_LINE (
3370         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3371
3372 #if 0
3373     /* FIXME
3374      *
3375      * This is copied from the MMX version, FIXME included.
3376      * If it's a problem there, it's probably a problem here too.
3377      */
3378     assert (src_image->drawable == mask_image->drawable);
3379 #endif
3380
3381     while (height--)
3382     {
3383         dst = dst_line;
3384         src = src_line;
3385
3386         dst_line += dst_stride;
3387         src_line += src_stride;
3388         w = width;
3389
3390         /* Align dst on a 16-byte boundary */
3391         while (w &&
3392                ((unsigned long)dst & 15))
3393         {
3394             s = *src++;
3395             d = *dst;
3396
3397             *dst++ = composite_over_8888_0565pixel (s, d);
3398             w--;
3399         }
3400
3401         /* It's an 8-pixel loop */
3402         while (w >= 8)
3403         {
3404             /* Load the source unaligned, since its address
3405              * is not guaranteed to be 16-byte aligned.
3406              */
3407             xmm_src = load_128_unaligned ((__m128i*) src);
3408             xmm_dst = load_128_aligned ((__m128i*) dst);
3409
3410             /* Unpacking */
3411             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3412             unpack_565_128_4x128 (xmm_dst,
3413                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3414             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3415                                 &xmm_alpha_lo, &xmm_alpha_hi);
3416
3417             /* Load the next 4 pixels from memory ahead of
3418              * time to optimize the memory read.
3419              */
3420             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3421
3422             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3423                         &xmm_alpha_lo, &xmm_alpha_hi,
3424                         &xmm_dst0, &xmm_dst1);
3425
3426             /* Unpacking */
3427             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3428             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3429                                 &xmm_alpha_lo, &xmm_alpha_hi);
3430
3431             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3432                         &xmm_alpha_lo, &xmm_alpha_hi,
3433                         &xmm_dst2, &xmm_dst3);
3434
3435             save_128_aligned (
3436                 (__m128i*)dst, pack_565_4x128_128 (
3437                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3438
3439             w -= 8;
3440             dst += 8;
3441             src += 8;
3442         }
3443
3444         while (w--)
3445         {
3446             s = *src++;
3447             d = *dst;
3448
3449             *dst++ = composite_over_8888_0565pixel (s, d);
3450         }
3451     }
3452
3453     _mm_empty ();
3454 }
3455
3456 /* -----------------------------------------------------------------
3457  * composite_over_n_8_8888
3458  */
3459
3460 static void
3461 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3462                               pixman_op_t              op,
3463                               pixman_image_t *         src_image,
3464                               pixman_image_t *         mask_image,
3465                               pixman_image_t *         dst_image,
3466                               int32_t                  src_x,
3467                               int32_t                  src_y,
3468                               int32_t                  mask_x,
3469                               int32_t                  mask_y,
3470                               int32_t                  dest_x,
3471                               int32_t                  dest_y,
3472                               int32_t                  width,
3473                               int32_t                  height)
3474 {
3475     uint32_t src, srca;
3476     uint32_t *dst_line, *dst;
3477     uint8_t *mask_line, *mask;
3478     int dst_stride, mask_stride;
3479     int32_t w;
3480     uint32_t m, d;
3481
3482     __m128i xmm_src, xmm_alpha, xmm_def;
3483     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3484     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3485
3486     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3487
3488     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3489
3490     srca = src >> 24;
3491     if (src == 0)
3492         return;
3493
3494     PIXMAN_IMAGE_GET_LINE (
3495         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3496     PIXMAN_IMAGE_GET_LINE (
3497         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3498
3499     xmm_def = create_mask_2x32_128 (src, src);
3500     xmm_src = expand_pixel_32_1x128 (src);
3501     xmm_alpha = expand_alpha_1x128 (xmm_src);
3502     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3503     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3504
3505     while (height--)
3506     {
3507         dst = dst_line;
3508         dst_line += dst_stride;
3509         mask = mask_line;
3510         mask_line += mask_stride;
3511         w = width;
3512
3513         while (w && (unsigned long)dst & 15)
3514         {
3515             uint8_t m = *mask++;
3516
3517             if (m)
3518             {
3519                 d = *dst;
3520                 mmx_mask = expand_pixel_8_1x64 (m);
3521                 mmx_dest = unpack_32_1x64 (d);
3522
3523                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3524                                                    &mmx_alpha,
3525                                                    &mmx_mask,
3526                                                    &mmx_dest));
3527             }
3528
3529             w--;
3530             dst++;
3531         }
3532
3533         while (w >= 4)
3534         {
3535             m = *((uint32_t*)mask);
3536
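            /* With an opaque source and all four mask bytes 0xff,
             * OVER reduces to storing the solid source color
             * directly.
             */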
3537             if (srca == 0xff && m == 0xffffffff)
3538             {
3539                 save_128_aligned ((__m128i*)dst, xmm_def);
3540             }
3541             else if (m)
3542             {
3543                 xmm_dst = load_128_aligned ((__m128i*) dst);
3544                 xmm_mask = unpack_32_1x128 (m);
3545                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3546
3547                 /* Unpacking */
3548                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3549                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3550
3551                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3552                                         &xmm_mask_lo, &xmm_mask_hi);
3553
3554                 in_over_2x128 (&xmm_src, &xmm_src,
3555                                &xmm_alpha, &xmm_alpha,
3556                                &xmm_mask_lo, &xmm_mask_hi,
3557                                &xmm_dst_lo, &xmm_dst_hi);
3558
3559                 save_128_aligned (
3560                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3561             }
3562
3563             w -= 4;
3564             dst += 4;
3565             mask += 4;
3566         }
3567
3568         while (w)
3569         {
3570             uint8_t m = *mask++;
3571
3572             if (m)
3573             {
3574                 d = *dst;
3575                 mmx_mask = expand_pixel_8_1x64 (m);
3576                 mmx_dest = unpack_32_1x64 (d);
3577
3578                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3579                                                    &mmx_alpha,
3580                                                    &mmx_mask,
3581                                                    &mmx_dest));
3582             }
3583
3584             w--;
3585             dst++;
3586         }
3587     }
3588
3589     _mm_empty ();
3590 }
3591
3592 /* ----------------------------------------------------------------
3593  * pixman_fill_sse2
3594  */
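/* The fill value is first replicated up to 32 bits (for bpp == 8,
 * 0x5A becomes 0x5A5A5A5A; for bpp == 16 the low half-word is
 * doubled), then broadcast into a 128-bit register so that the bulk
 * of each scanline can be written with aligned 16-byte stores;
 * byte/word/dword stores cover the unaligned head and the tail.
 */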
3595
3596 pixman_bool_t
3597 pixman_fill_sse2 (uint32_t *bits,
3598                   int       stride,
3599                   int       bpp,
3600                   int       x,
3601                   int       y,
3602                   int       width,
3603                   int       height,
3604                   uint32_t  data)
3605 {
3606     uint32_t byte_width;
3607     uint8_t         *byte_line;
3608
3609     __m128i xmm_def;
3610
3611     if (bpp == 8)
3612     {
3613         uint8_t b;
3614         uint16_t w;
3615
3616         stride = stride * (int) sizeof (uint32_t) / 1;
3617         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3618         byte_width = width;
3619         stride *= 1;
3620
3621         b = data & 0xff;
3622         w = (b << 8) | b;
3623         data = (w << 16) | w;
3624     }
3625     else if (bpp == 16)
3626     {
3627         stride = stride * (int) sizeof (uint32_t) / 2;
3628         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3629         byte_width = 2 * width;
3630         stride *= 2;
3631
3632         data = (data & 0xffff) * 0x00010001;
3633     }
3634     else if (bpp == 32)
3635     {
3636         stride = stride * (int) sizeof (uint32_t) / 4;
3637         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3638         byte_width = 4 * width;
3639         stride *= 4;
3640     }
3641     else
3642     {
3643         return FALSE;
3644     }
3645
3646     xmm_def = create_mask_2x32_128 (data, data);
3647
3648     while (height--)
3649     {
3650         int w;
3651         uint8_t *d = byte_line;
3652         byte_line += stride;
3653         w = byte_width;
3654
3655         while (w >= 1 && ((unsigned long)d & 1))
3656         {
3657             *(uint8_t *)d = data;
3658             w -= 1;
3659             d += 1;
3660         }
3661
3662         while (w >= 2 && ((unsigned long)d & 3))
3663         {
3664             *(uint16_t *)d = data;
3665             w -= 2;
3666             d += 2;
3667         }
3668
3669         while (w >= 4 && ((unsigned long)d & 15))
3670         {
3671             *(uint32_t *)d = data;
3672
3673             w -= 4;
3674             d += 4;
3675         }
3676
3677         while (w >= 128)
3678         {
3679             save_128_aligned ((__m128i*)(d),     xmm_def);
3680             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3681             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3682             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3683             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3684             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3685             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3686             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3687
3688             d += 128;
3689             w -= 128;
3690         }
3691
3692         if (w >= 64)
3693         {
3694             save_128_aligned ((__m128i*)(d),     xmm_def);
3695             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3696             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3697             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3698
3699             d += 64;
3700             w -= 64;
3701         }
3702
3703         if (w >= 32)
3704         {
3705             save_128_aligned ((__m128i*)(d),     xmm_def);
3706             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3707
3708             d += 32;
3709             w -= 32;
3710         }
3711
3712         if (w >= 16)
3713         {
3714             save_128_aligned ((__m128i*)(d),     xmm_def);
3715
3716             d += 16;
3717             w -= 16;
3718         }
3719
3720         while (w >= 4)
3721         {
3722             *(uint32_t *)d = data;
3723
3724             w -= 4;
3725             d += 4;
3726         }
3727
3728         if (w >= 2)
3729         {
3730             *(uint16_t *)d = data;
3731             w -= 2;
3732             d += 2;
3733         }
3734
3735         if (w >= 1)
3736         {
3737             *(uint8_t *)d = data;
3738             w -= 1;
3739             d += 1;
3740         }
3741     }
3742
3743     _mm_empty ();
3744     return TRUE;
3745 }
3746
3747 static void
3748 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3749                              pixman_op_t              op,
3750                              pixman_image_t *         src_image,
3751                              pixman_image_t *         mask_image,
3752                              pixman_image_t *         dst_image,
3753                              int32_t                  src_x,
3754                              int32_t                  src_y,
3755                              int32_t                  mask_x,
3756                              int32_t                  mask_y,
3757                              int32_t                  dest_x,
3758                              int32_t                  dest_y,
3759                              int32_t                  width,
3760                              int32_t                  height)
3761 {
3762     uint32_t src, srca;
3763     uint32_t    *dst_line, *dst;
3764     uint8_t     *mask_line, *mask;
3765     int dst_stride, mask_stride;
3766     int32_t w;
3767     uint32_t m;
3768
3769     __m128i xmm_src, xmm_def;
3770     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3771
3772     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3773
3774     srca = src >> 24;
3775     if (src == 0)
3776     {
3777         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3778                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
3779                           dest_x, dest_y, width, height, 0);
3780         return;
3781     }
3782
3783     PIXMAN_IMAGE_GET_LINE (
3784         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3785     PIXMAN_IMAGE_GET_LINE (
3786         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3787
3788     xmm_def = create_mask_2x32_128 (src, src);
3789     xmm_src = expand_pixel_32_1x128 (src);
3790
3791     while (height--)
3792     {
3793         dst = dst_line;
3794         dst_line += dst_stride;
3795         mask = mask_line;
3796         mask_line += mask_stride;
3797         w = width;
3798
3799         while (w && (unsigned long)dst & 15)
3800         {
3801             uint8_t m = *mask++;
3802
3803             if (m)
3804             {
3805                 *dst = pack_1x64_32 (
3806                     pix_multiply_1x64 (
3807                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3808             }
3809             else
3810             {
3811                 *dst = 0;
3812             }
3813
3814             w--;
3815             dst++;
3816         }
3817
3818         while (w >= 4)
3819         {
3820             m = *((uint32_t*)mask);
3821
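            /* Per group of four pixels there are three cases: an
             * opaque source with a fully set mask is a plain store
             * of the solid color, a zero mask stores zero (SRC
             * semantics), and anything in between multiplies the
             * solid source by the mask.
             */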
3822             if (srca == 0xff && m == 0xffffffff)
3823             {
3824                 save_128_aligned ((__m128i*)dst, xmm_def);
3825             }
3826             else if (m)
3827             {
3828                 xmm_mask = unpack_32_1x128 (m);
3829                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3830
3831                 /* Unpacking */
3832                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3833
3834                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3835                                         &xmm_mask_lo, &xmm_mask_hi);
3836
3837                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3838                                     &xmm_mask_lo, &xmm_mask_hi,
3839                                     &xmm_mask_lo, &xmm_mask_hi);
3840
3841                 save_128_aligned (
3842                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3843             }
3844             else
3845             {
3846                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3847             }
3848
3849             w -= 4;
3850             dst += 4;
3851             mask += 4;
3852         }
3853
3854         while (w)
3855         {
3856             uint8_t m = *mask++;
3857
3858             if (m)
3859             {
3860                 *dst = pack_1x64_32 (
3861                     pix_multiply_1x64 (
3862                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3863             }
3864             else
3865             {
3866                 *dst = 0;
3867             }
3868
3869             w--;
3870             dst++;
3871         }
3872     }
3873
3874     _mm_empty ();
3875 }
3876
3877 /*-----------------------------------------------------------------------
3878  * composite_over_n_8_0565
3879  */
3880
3881 static void
3882 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3883                               pixman_op_t              op,
3884                               pixman_image_t *         src_image,
3885                               pixman_image_t *         mask_image,
3886                               pixman_image_t *         dst_image,
3887                               int32_t                  src_x,
3888                               int32_t                  src_y,
3889                               int32_t                  mask_x,
3890                               int32_t                  mask_y,
3891                               int32_t                  dest_x,
3892                               int32_t                  dest_y,
3893                               int32_t                  width,
3894                               int32_t                  height)
3895 {
3896     uint32_t src, srca;
3897     uint16_t    *dst_line, *dst, d;
3898     uint8_t     *mask_line, *mask;
3899     int dst_stride, mask_stride;
3900     int32_t w;
3901     uint32_t m;
3902     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3903
3904     __m128i xmm_src, xmm_alpha;
3905     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3906     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3907
3908     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3909
3910     srca = src >> 24;
3911     if (src == 0)
3912         return;
3913
3914     PIXMAN_IMAGE_GET_LINE (
3915         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3916     PIXMAN_IMAGE_GET_LINE (
3917         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3918
3919     xmm_src = expand_pixel_32_1x128 (src);
3920     xmm_alpha = expand_alpha_1x128 (xmm_src);
3921     mmx_src = _mm_movepi64_pi64 (xmm_src);
3922     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3923
3924     while (height--)
3925     {
3926         dst = dst_line;
3927         dst_line += dst_stride;
3928         mask = mask_line;
3929         mask_line += mask_stride;
3930         w = width;
3931
3932         while (w && (unsigned long)dst & 15)
3933         {
3934             m = *mask++;
3935
3936             if (m)
3937             {
3938                 d = *dst;
3939                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
3940                 mmx_dest = expand565_16_1x64 (d);
3941
3942                 *dst = pack_565_32_16 (
3943                     pack_1x64_32 (
3944                         in_over_1x64 (
3945                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3946             }
3947
3948             w--;
3949             dst++;
3950         }
3951
3952         while (w >= 8)
3953         {
3954             xmm_dst = load_128_aligned ((__m128i*) dst);
3955             unpack_565_128_4x128 (xmm_dst,
3956                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3957
3958             m = *((uint32_t*)mask);
3959             mask += 4;
3960
3961             if (m)
3962             {
3963                 xmm_mask = unpack_32_1x128 (m);
3964                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3965
3966                 /* Unpacking */
3967                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3968
3969                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3970                                         &xmm_mask_lo, &xmm_mask_hi);
3971
3972                 in_over_2x128 (&xmm_src, &xmm_src,
3973                                &xmm_alpha, &xmm_alpha,
3974                                &xmm_mask_lo, &xmm_mask_hi,
3975                                &xmm_dst0, &xmm_dst1);
3976             }
3977
3978             m = *((uint32_t*)mask);
3979             mask += 4;
3980
3981             if (m)
3982             {
3983                 xmm_mask = unpack_32_1x128 (m);
3984                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3985
3986                 /* Unpacking */
3987                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3988
3989                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3990                                         &xmm_mask_lo, &xmm_mask_hi);
3991                 in_over_2x128 (&xmm_src, &xmm_src,
3992                                &xmm_alpha, &xmm_alpha,
3993                                &xmm_mask_lo, &xmm_mask_hi,
3994                                &xmm_dst2, &xmm_dst3);
3995             }
3996
3997             save_128_aligned (
3998                 (__m128i*)dst, pack_565_4x128_128 (
3999                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4000
4001             w -= 8;
4002             dst += 8;
4003         }
4004
4005         while (w)
4006         {
4007             m = *mask++;
4008
4009             if (m)
4010             {
4011                 d = *dst;
4012                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4013                 mmx_dest = expand565_16_1x64 (d);
4014
4015                 *dst = pack_565_32_16 (
4016                     pack_1x64_32 (
4017                         in_over_1x64 (
4018                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4019             }
4020
4021             w--;
4022             dst++;
4023         }
4024     }
4025
4026     _mm_empty ();
4027 }
4028
4029 /* -----------------------------------------------------------------------
4030  * composite_over_pixbuf_0565
4031  */
4032
4033 static void
4034 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4035                                  pixman_op_t              op,
4036                                  pixman_image_t *         src_image,
4037                                  pixman_image_t *         mask_image,
4038                                  pixman_image_t *         dst_image,
4039                                  int32_t                  src_x,
4040                                  int32_t                  src_y,
4041                                  int32_t                  mask_x,
4042                                  int32_t                  mask_y,
4043                                  int32_t                  dest_x,
4044                                  int32_t                  dest_y,
4045                                  int32_t                  width,
4046                                  int32_t                  height)
4047 {
4048     uint16_t    *dst_line, *dst, d;
4049     uint32_t    *src_line, *src, s;
4050     int dst_stride, src_stride;
4051     int32_t w;
4052     uint32_t opaque, zero;
4053
4054     __m64 ms;
4055     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4056     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4057
4058     PIXMAN_IMAGE_GET_LINE (
4059         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4060     PIXMAN_IMAGE_GET_LINE (
4061         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4062
4063 #if 0
4064     /* FIXME
4065      *
4066      * This code was copied from the MMX version, FIXME included.
4067      * If it's a problem there, it's probably a problem here too.
4068      */
4069     assert (src_image->drawable == mask_image->drawable);
4070 #endif
4071
4072     while (height--)
4073     {
4074         dst = dst_line;
4075         dst_line += dst_stride;
4076         src = src_line;
4077         src_line += src_stride;
4078         w = width;
4079
4080         while (w && (unsigned long)dst & 15)
4081         {
4082             s = *src++;
4083             d = *dst;
4084
4085             ms = unpack_32_1x64 (s);
4086
4087             *dst++ = pack_565_32_16 (
4088                 pack_1x64_32 (
4089                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4090             w--;
4091         }
4092
4093         while (w >= 8)
4094         {
4095             /* First round */
4096             xmm_src = load_128_unaligned ((__m128i*)src);
4097             xmm_dst = load_128_aligned  ((__m128i*)dst);
4098
4099             opaque = is_opaque (xmm_src);
4100             zero = is_zero (xmm_src);
4101
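            /* If all four source pixels are opaque the source simply
             * replaces the destination (only its red and blue
             * channels need swapping); if they are all transparent
             * the destination pixels are kept unchanged.
             */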
4102             unpack_565_128_4x128 (xmm_dst,
4103                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4104             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4105
4106             /* preload next round */
4107             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4108
4109             if (opaque)
4110             {
4111                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4112                                      &xmm_dst0, &xmm_dst1);
4113             }
4114             else if (!zero)
4115             {
4116                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4117                                         &xmm_dst0, &xmm_dst1);
4118             }
4119
4120             /* Second round */
4121             opaque = is_opaque (xmm_src);
4122             zero = is_zero (xmm_src);
4123
4124             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4125
4126             if (opaque)
4127             {
4128                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4129                                      &xmm_dst2, &xmm_dst3);
4130             }
4131             else if (!zero)
4132             {
4133                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4134                                         &xmm_dst2, &xmm_dst3);
4135             }
4136
4137             save_128_aligned (
4138                 (__m128i*)dst, pack_565_4x128_128 (
4139                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4140
4141             w -= 8;
4142             src += 8;
4143             dst += 8;
4144         }
4145
4146         while (w)
4147         {
4148             s = *src++;
4149             d = *dst;
4150
4151             ms = unpack_32_1x64 (s);
4152
4153             *dst++ = pack_565_32_16 (
4154                 pack_1x64_32 (
4155                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4156             w--;
4157         }
4158     }
4159
4160     _mm_empty ();
4161 }
4162
4163 /* -------------------------------------------------------------------------
4164  * composite_over_pixbuf_8888
4165  */
4166
4167 static void
4168 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4169                                  pixman_op_t              op,
4170                                  pixman_image_t *         src_image,
4171                                  pixman_image_t *         mask_image,
4172                                  pixman_image_t *         dst_image,
4173                                  int32_t                  src_x,
4174                                  int32_t                  src_y,
4175                                  int32_t                  mask_x,
4176                                  int32_t                  mask_y,
4177                                  int32_t                  dest_x,
4178                                  int32_t                  dest_y,
4179                                  int32_t                  width,
4180                                  int32_t                  height)
4181 {
4182     uint32_t    *dst_line, *dst, d;
4183     uint32_t    *src_line, *src, s;
4184     int dst_stride, src_stride;
4185     int32_t w;
4186     uint32_t opaque, zero;
4187
4188     __m128i xmm_src_lo, xmm_src_hi;
4189     __m128i xmm_dst_lo, xmm_dst_hi;
4190
4191     PIXMAN_IMAGE_GET_LINE (
4192         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4193     PIXMAN_IMAGE_GET_LINE (
4194         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4195
4196 #if 0
4197     /* FIXME
4198      *
4199      * This code was copied from the MMX version, FIXME included.
4200      * If it's a problem there, it's probably a problem here too.
4201      */
4202     assert (src_image->drawable == mask_image->drawable);
4203 #endif
4204
4205     while (height--)
4206     {
4207         dst = dst_line;
4208         dst_line += dst_stride;
4209         src = src_line;
4210         src_line += src_stride;
4211         w = width;
4212
4213         while (w && (unsigned long)dst & 15)
4214         {
4215             s = *src++;
4216             d = *dst;
4217
4218             *dst++ = pack_1x64_32 (
4219                 over_rev_non_pre_1x64 (
4220                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4221
4222             w--;
4223         }
4224
4225         while (w >= 4)
4226         {
4227             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4228
4229             opaque = is_opaque (xmm_src_hi);
4230             zero = is_zero (xmm_src_hi);
4231
4232             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4233
4234             if (opaque)
4235             {
4236                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4237                                      &xmm_dst_lo, &xmm_dst_hi);
4238
4239                 save_128_aligned (
4240                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4241             }
4242             else if (!zero)
4243             {
4244                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4245
4246                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4247
4248                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4249                                         &xmm_dst_lo, &xmm_dst_hi);
4250
4251                 save_128_aligned (
4252                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4253             }
4254
4255             w -= 4;
4256             dst += 4;
4257             src += 4;
4258         }
4259
4260         while (w)
4261         {
4262             s = *src++;
4263             d = *dst;
4264
4265             *dst++ = pack_1x64_32 (
4266                 over_rev_non_pre_1x64 (
4267                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4268
4269             w--;
4270         }
4271     }
4272
4273     _mm_empty ();
4274 }
4275
4276 /* -------------------------------------------------------------------------------------------------
4277  * composite_over_n_8888_0565_ca
4278  */
4279
4280 static void
4281 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4282                                     pixman_op_t              op,
4283                                     pixman_image_t *         src_image,
4284                                     pixman_image_t *         mask_image,
4285                                     pixman_image_t *         dst_image,
4286                                     int32_t                  src_x,
4287                                     int32_t                  src_y,
4288                                     int32_t                  mask_x,
4289                                     int32_t                  mask_y,
4290                                     int32_t                  dest_x,
4291                                     int32_t                  dest_y,
4292                                     int32_t                  width,
4293                                     int32_t                  height)
4294 {
4295     uint32_t src;
4296     uint16_t    *dst_line, *dst, d;
4297     uint32_t    *mask_line, *mask, m;
4298     int dst_stride, mask_stride;
4299     int w;
4300     uint32_t pack_cmp;
4301
4302     __m128i xmm_src, xmm_alpha;
4303     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4304     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4305
4306     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4307
4308     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4309
4310     if (src == 0)
4311         return;
4312
4313     PIXMAN_IMAGE_GET_LINE (
4314         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4315     PIXMAN_IMAGE_GET_LINE (
4316         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4317
4318     xmm_src = expand_pixel_32_1x128 (src);
4319     xmm_alpha = expand_alpha_1x128 (xmm_src);
4320     mmx_src = _mm_movepi64_pi64 (xmm_src);
4321     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4322
4323     while (height--)
4324     {
4325         w = width;
4326         mask = mask_line;
4327         dst = dst_line;
4328         mask_line += mask_stride;
4329         dst_line += dst_stride;
4330
4331         while (w && ((unsigned long)dst & 15))
4332         {
4333             m = *(uint32_t *) mask;
4334
4335             if (m)
4336             {
4337                 d = *dst;
4338                 mmx_mask = unpack_32_1x64 (m);
4339                 mmx_dest = expand565_16_1x64 (d);
4340
4341                 *dst = pack_565_32_16 (
4342                     pack_1x64_32 (
4343                         in_over_1x64 (
4344                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4345             }
4346
4347             w--;
4348             dst++;
4349             mask++;
4350         }
4351
4352         while (w >= 8)
4353         {
4354             /* First round */
4355             xmm_mask = load_128_unaligned ((__m128i*)mask);
4356             xmm_dst = load_128_aligned ((__m128i*)dst);
4357
4358             pack_cmp = _mm_movemask_epi8 (
4359                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
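            /* pack_cmp is 0xffff only when all four mask pixels are
             * zero; in that case the in_over below is skipped and
             * the destination is left untouched.
             */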
4360
4361             unpack_565_128_4x128 (xmm_dst,
4362                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4363             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4364
4365             /* preload next round */
4366             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4367
4369             if (pack_cmp != 0xffff)
4370             {
4371                 in_over_2x128 (&xmm_src, &xmm_src,
4372                                &xmm_alpha, &xmm_alpha,
4373                                &xmm_mask_lo, &xmm_mask_hi,
4374                                &xmm_dst0, &xmm_dst1);
4375             }
4376
4377             /* Second round */
4378             pack_cmp = _mm_movemask_epi8 (
4379                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4380
4381             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4382
4383             if (pack_cmp != 0xffff)
4384             {
4385                 in_over_2x128 (&xmm_src, &xmm_src,
4386                                &xmm_alpha, &xmm_alpha,
4387                                &xmm_mask_lo, &xmm_mask_hi,
4388                                &xmm_dst2, &xmm_dst3);
4389             }
4390
4391             save_128_aligned (
4392                 (__m128i*)dst, pack_565_4x128_128 (
4393                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4394
4395             w -= 8;
4396             dst += 8;
4397             mask += 8;
4398         }
4399
4400         while (w)
4401         {
4402             m = *(uint32_t *) mask;
4403
4404             if (m)
4405             {
4406                 d = *dst;
4407                 mmx_mask = unpack_32_1x64 (m);
4408                 mmx_dest = expand565_16_1x64 (d);
4409
4410                 *dst = pack_565_32_16 (
4411                     pack_1x64_32 (
4412                         in_over_1x64 (
4413                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4414             }
4415
4416             w--;
4417             dst++;
4418             mask++;
4419         }
4420     }
4421
4422     _mm_empty ();
4423 }
4424
4425 /* -----------------------------------------------------------------------
4426  * composite_in_n_8_8
4427  */
4428
4429 static void
4430 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4431                          pixman_op_t              op,
4432                          pixman_image_t *         src_image,
4433                          pixman_image_t *         mask_image,
4434                          pixman_image_t *         dst_image,
4435                          int32_t                  src_x,
4436                          int32_t                  src_y,
4437                          int32_t                  mask_x,
4438                          int32_t                  mask_y,
4439                          int32_t                  dest_x,
4440                          int32_t                  dest_y,
4441                          int32_t                  width,
4442                          int32_t                  height)
4443 {
4444     uint8_t     *dst_line, *dst;
4445     uint8_t     *mask_line, *mask;
4446     int dst_stride, mask_stride;
4447     uint32_t d, m;
4448     uint32_t src;
4449     uint8_t sa;
4450     int32_t w;
4451
4452     __m128i xmm_alpha;
4453     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4454     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4455
4456     PIXMAN_IMAGE_GET_LINE (
4457         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4458     PIXMAN_IMAGE_GET_LINE (
4459         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4460
4461     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4462
4463     sa = src >> 24;
4464
4465     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4466
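    /* For IN with an a8 mask the result is
     * dst = src_alpha * mask * dst, so only the expanded alpha of
     * the solid source is needed.
     */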
4467     while (height--)
4468     {
4469         dst = dst_line;
4470         dst_line += dst_stride;
4471         mask = mask_line;
4472         mask_line += mask_stride;
4473         w = width;
4474
4475         while (w && ((unsigned long)dst & 15))
4476         {
4477             m = (uint32_t) *mask++;
4478             d = (uint32_t) *dst;
4479
4480             *dst++ = (uint8_t) pack_1x64_32 (
4481                 pix_multiply_1x64 (
4482                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4483                                        unpack_32_1x64 (m)),
4484                     unpack_32_1x64 (d)));
4485             w--;
4486         }
4487
4488         while (w >= 16)
4489         {
4490             xmm_mask = load_128_unaligned ((__m128i*)mask);
4491             xmm_dst = load_128_aligned ((__m128i*)dst);
4492
4493             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4494             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4495
4496             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4497                                 &xmm_mask_lo, &xmm_mask_hi,
4498                                 &xmm_mask_lo, &xmm_mask_hi);
4499
4500             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4501                                 &xmm_dst_lo, &xmm_dst_hi,
4502                                 &xmm_dst_lo, &xmm_dst_hi);
4503
4504             save_128_aligned (
4505                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4506
4507             mask += 16;
4508             dst += 16;
4509             w -= 16;
4510         }
4511
4512         while (w)
4513         {
4514             m = (uint32_t) *mask++;
4515             d = (uint32_t) *dst;
4516
4517             *dst++ = (uint8_t) pack_1x64_32 (
4518                 pix_multiply_1x64 (
4519                     pix_multiply_1x64 (
4520                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4521                     unpack_32_1x64 (d)));
4522             w--;
4523         }
4524     }
4525
4526     _mm_empty ();
4527 }
4528
4529 /* -----------------------------------------------------------------------
4530  * composite_in_n_8
4531  */
4532
4533 static void
4534 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4535                        pixman_op_t              op,
4536                        pixman_image_t *         src_image,
4537                        pixman_image_t *         mask_image,
4538                        pixman_image_t *         dst_image,
4539                        int32_t                  src_x,
4540                        int32_t                  src_y,
4541                        int32_t                  mask_x,
4542                        int32_t                  mask_y,
4543                        int32_t                  dest_x,
4544                        int32_t                  dest_y,
4545                        int32_t                  width,
4546                        int32_t                  height)
4547 {
4548     uint8_t     *dst_line, *dst;
4549     int dst_stride;
4550     uint32_t d;
4551     uint32_t src;
4552     int32_t w;
4553
4554     __m128i xmm_alpha;
4555     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4556
4557     PIXMAN_IMAGE_GET_LINE (
4558         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4559
4560     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4561
4562     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4563
4564     src = src >> 24;
4565
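    /* With a solid source, IN just scales the destination by the
     * source alpha: alpha 0xff leaves it unchanged and alpha 0x00
     * clears it, so both extremes skip the loop entirely.
     */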
4566     if (src == 0xff)
4567         return;
4568
4569     if (src == 0x00)
4570     {
4571         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4572                      8, dest_x, dest_y, width, height, src);
4573
4574         return;
4575     }
4576
4577     while (height--)
4578     {
4579         dst = dst_line;
4580         dst_line += dst_stride;
4581         w = width;
4582
4583         while (w && ((unsigned long)dst & 15))
4584         {
4585             d = (uint32_t) *dst;
4586
4587             *dst++ = (uint8_t) pack_1x64_32 (
4588                 pix_multiply_1x64 (
4589                     _mm_movepi64_pi64 (xmm_alpha),
4590                     unpack_32_1x64 (d)));
4591             w--;
4592         }
4593
4594         while (w >= 16)
4595         {
4596             xmm_dst = load_128_aligned ((__m128i*)dst);
4597
4598             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4599
4600             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4601                                 &xmm_dst_lo, &xmm_dst_hi,
4602                                 &xmm_dst_lo, &xmm_dst_hi);
4603
4604             save_128_aligned (
4605                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4606
4607             dst += 16;
4608             w -= 16;
4609         }
4610
4611         while (w)
4612         {
4613             d = (uint32_t) *dst;
4614
4615             *dst++ = (uint8_t) pack_1x64_32 (
4616                 pix_multiply_1x64 (
4617                     _mm_movepi64_pi64 (xmm_alpha),
4618                     unpack_32_1x64 (d)));
4619             w--;
4620         }
4621     }
4622
4623     _mm_empty ();
4624 }
4625
4626 /* ---------------------------------------------------------------------------
4627  * composite_in_8_8
4628  */
4629
4630 static void
4631 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4632                        pixman_op_t              op,
4633                        pixman_image_t *         src_image,
4634                        pixman_image_t *         mask_image,
4635                        pixman_image_t *         dst_image,
4636                        int32_t                  src_x,
4637                        int32_t                  src_y,
4638                        int32_t                  mask_x,
4639                        int32_t                  mask_y,
4640                        int32_t                  dest_x,
4641                        int32_t                  dest_y,
4642                        int32_t                  width,
4643                        int32_t                  height)
4644 {
4645     uint8_t     *dst_line, *dst;
4646     uint8_t     *src_line, *src;
4647     int src_stride, dst_stride;
4648     int32_t w;
4649     uint32_t s, d;
4650
4651     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4652     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4653
4654     PIXMAN_IMAGE_GET_LINE (
4655         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4656     PIXMAN_IMAGE_GET_LINE (
4657         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4658
4659     while (height--)
4660     {
4661         dst = dst_line;
4662         dst_line += dst_stride;
4663         src = src_line;
4664         src_line += src_stride;
4665         w = width;
4666
4667         while (w && ((unsigned long)dst & 15))
4668         {
4669             s = (uint32_t) *src++;
4670             d = (uint32_t) *dst;
4671
4672             *dst++ = (uint8_t) pack_1x64_32 (
4673                 pix_multiply_1x64 (
4674                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4675             w--;
4676         }
4677
4678         while (w >= 16)
4679         {
4680             xmm_src = load_128_unaligned ((__m128i*)src);
4681             xmm_dst = load_128_aligned ((__m128i*)dst);
4682
4683             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4684             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4685
4686             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4687                                 &xmm_dst_lo, &xmm_dst_hi,
4688                                 &xmm_dst_lo, &xmm_dst_hi);
4689
4690             save_128_aligned (
4691                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4692
4693             src += 16;
4694             dst += 16;
4695             w -= 16;
4696         }
4697
4698         while (w)
4699         {
4700             s = (uint32_t) *src++;
4701             d = (uint32_t) *dst;
4702
4703             *dst++ = (uint8_t) pack_1x64_32 (
4704                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
4705             w--;
4706         }
4707     }
4708
4709     _mm_empty ();
4710 }
4711
4712 /* -------------------------------------------------------------------------
4713  * composite_add_n_8_8
4714  */
4715
4716 static void
4717 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4718                           pixman_op_t              op,
4719                           pixman_image_t *         src_image,
4720                           pixman_image_t *         mask_image,
4721                           pixman_image_t *         dst_image,
4722                           int32_t                  src_x,
4723                           int32_t                  src_y,
4724                           int32_t                  mask_x,
4725                           int32_t                  mask_y,
4726                           int32_t                  dest_x,
4727                           int32_t                  dest_y,
4728                           int32_t                  width,
4729                           int32_t                  height)
4730 {
4731     uint8_t     *dst_line, *dst;
4732     uint8_t     *mask_line, *mask;
4733     int dst_stride, mask_stride;
4734     int32_t w;
4735     uint32_t src;
4736     uint8_t sa;
4737     uint32_t m, d;
4738
4739     __m128i xmm_alpha;
4740     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4741     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4742
4743     PIXMAN_IMAGE_GET_LINE (
4744         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4745     PIXMAN_IMAGE_GET_LINE (
4746         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4747
4748     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4749
4750     sa = src >> 24;
4751
4752     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4753
4754     while (height--)
4755     {
4756         dst = dst_line;
4757         dst_line += dst_stride;
4758         mask = mask_line;
4759         mask_line += mask_stride;
4760         w = width;
4761
4762         while (w && ((unsigned long)dst & 15))
4763         {
4764             m = (uint32_t) *mask++;
4765             d = (uint32_t) *dst;
4766
4767             *dst++ = (uint8_t) pack_1x64_32 (
4768                 _mm_adds_pu16 (
4769                     pix_multiply_1x64 (
4770                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4771                     unpack_32_1x64 (d)));
4772             w--;
4773         }
4774
4775         while (w >= 16)
4776         {
4777             xmm_mask = load_128_unaligned ((__m128i*)mask);
4778             xmm_dst = load_128_aligned ((__m128i*)dst);
4779
4780             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4781             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4782
4783             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4784                                 &xmm_mask_lo, &xmm_mask_hi,
4785                                 &xmm_mask_lo, &xmm_mask_hi);
4786
4787             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4788             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4789
4790             save_128_aligned (
4791                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4792
4793             mask += 16;
4794             dst += 16;
4795             w -= 16;
4796         }
4797
4798         while (w)
4799         {
4800             m = (uint32_t) *mask++;
4801             d = (uint32_t) *dst;
4802
4803             *dst++ = (uint8_t) pack_1x64_32 (
4804                 _mm_adds_pu16 (
4805                     pix_multiply_1x64 (
4806                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4807                     unpack_32_1x64 (d)));
4808
4809             w--;
4810         }
4811     }
4812
4813     _mm_empty ();
4814 }
4815
4816 /* -------------------------------------------------------------------------
4817  * composite_add_n_8
4818  */
4819
4820 static void
4821 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4822                         pixman_op_t              op,
4823                         pixman_image_t *         src_image,
4824                         pixman_image_t *         mask_image,
4825                         pixman_image_t *         dst_image,
4826                         int32_t                  src_x,
4827                         int32_t                  src_y,
4828                         int32_t                  mask_x,
4829                         int32_t                  mask_y,
4830                         int32_t                  dest_x,
4831                         int32_t                  dest_y,
4832                         int32_t                  width,
4833                         int32_t                  height)
4834 {
4835     uint8_t     *dst_line, *dst;
4836     int dst_stride;
4837     int32_t w;
4838     uint32_t src;
4839
4840     __m128i xmm_src;
4841
4842     PIXMAN_IMAGE_GET_LINE (
4843         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4844
4845     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4846
4847     src >>= 24;
4848
4849     if (src == 0x00)
4850         return;
4851
4852     if (src == 0xff)
4853     {
4854         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4855                      8, dest_x, dest_y, width, height, 0xff);
4856
4857         return;
4858     }
4859
4860     src = (src << 24) | (src << 16) | (src << 8) | src;
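    /* Replicate the 8-bit alpha into every byte of xmm_src so that
     * one saturating add covers 16 destination bytes at a time.
     */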
4861     xmm_src = _mm_set_epi32 (src, src, src, src);
4862
4863     while (height--)
4864     {
4865         dst = dst_line;
4866         dst_line += dst_stride;
4867         w = width;
4868
4869         while (w && ((unsigned long)dst & 15))
4870         {
4871             *dst = (uint8_t)_mm_cvtsi64_si32 (
4872                 _mm_adds_pu8 (
4873                     _mm_movepi64_pi64 (xmm_src),
4874                     _mm_cvtsi32_si64 (*dst)));
4875
4876             w--;
4877             dst++;
4878         }
4879
4880         while (w >= 16)
4881         {
4882             save_128_aligned (
4883                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4884
4885             dst += 16;
4886             w -= 16;
4887         }
4888
4889         while (w)
4890         {
4891             *dst = (uint8_t)_mm_cvtsi64_si32 (
4892                 _mm_adds_pu8 (
4893                     _mm_movepi64_pi64 (xmm_src),
4894                     _mm_cvtsi32_si64 (*dst)));
4895
4896             w--;
4897             dst++;
4898         }
4899     }
4900
4901     _mm_empty ();
4902 }
4903
4904 /* ----------------------------------------------------------------------
4905  * composite_add_8000_8000
4906  */
4907
4908 static void
4909 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
4910                               pixman_op_t              op,
4911                               pixman_image_t *         src_image,
4912                               pixman_image_t *         mask_image,
4913                               pixman_image_t *         dst_image,
4914                               int32_t                  src_x,
4915                               int32_t                  src_y,
4916                               int32_t                  mask_x,
4917                               int32_t                  mask_y,
4918                               int32_t                  dest_x,
4919                               int32_t                  dest_y,
4920                               int32_t                  width,
4921                               int32_t                  height)
4922 {
4923     uint8_t     *dst_line, *dst;
4924     uint8_t     *src_line, *src;
4925     int dst_stride, src_stride;
4926     int32_t w;
4927     uint16_t t;
4928
4929     PIXMAN_IMAGE_GET_LINE (
4930         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4931     PIXMAN_IMAGE_GET_LINE (
4932         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4933
4934     while (height--)
4935     {
4936         dst = dst_line;
4937         src = src_line;
4938
4939         dst_line += dst_stride;
4940         src_line += src_stride;
4941         w = width;
4942
4943         /* Small head */
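        /* t | (0 - (t >> 8)) is a branch-free saturating add: when
         * the 9-bit sum exceeds 0xff, (t >> 8) is 1, (0 - 1) sets
         * every bit, and the stored byte clamps to 0xff.
         */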
4944         while (w && (unsigned long)dst & 3)
4945         {
4946             t = (*dst) + (*src++);
4947             *dst++ = t | (0 - (t >> 8));
4948             w--;
4949         }
4950
4951         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4952
4953         /* Small tail */
4954         dst += w & 0xfffc;
4955         src += w & 0xfffc;
4956
4957         w &= 3;
4958
4959         while (w)
4960         {
4961             t = (*dst) + (*src++);
4962             *dst++ = t | (0 - (t >> 8));
4963             w--;
4964         }
4965     }
4966
4967     _mm_empty ();
4968 }
4969
4970 /* ---------------------------------------------------------------------
4971  * composite_add_8888_8888
4972  */
4973 static void
4974 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4975                               pixman_op_t              op,
4976                               pixman_image_t *         src_image,
4977                               pixman_image_t *         mask_image,
4978                               pixman_image_t *         dst_image,
4979                               int32_t                  src_x,
4980                               int32_t                  src_y,
4981                               int32_t                  mask_x,
4982                               int32_t                  mask_y,
4983                               int32_t                  dest_x,
4984                               int32_t                  dest_y,
4985                               int32_t                  width,
4986                               int32_t                  height)
4987 {
4988     uint32_t    *dst_line, *dst;
4989     uint32_t    *src_line, *src;
4990     int dst_stride, src_stride;
4991
4992     PIXMAN_IMAGE_GET_LINE (
4993         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4994     PIXMAN_IMAGE_GET_LINE (
4995         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4996
4997     while (height--)
4998     {
4999         dst = dst_line;
5000         dst_line += dst_stride;
5001         src = src_line;
5002         src_line += src_stride;
5003
5004         core_combine_add_u_sse2 (dst, src, NULL, width);
5005     }
5006
5007     _mm_empty ();
5008 }
5009
5010 /* -------------------------------------------------------------------------------------------------
5011  * sse2_composite_copy_area
5012  */
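/* pixman_blt_sse2 copies each scanline with unaligned 128-bit loads
 * and aligned 128-bit stores, 64 bytes per iteration in the main
 * loop, after aligning the destination pointer; only the 16 bpp and
 * 32 bpp cases are handled, anything else returns FALSE.
 */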
5013
5014 static pixman_bool_t
5015 pixman_blt_sse2 (uint32_t *src_bits,
5016                  uint32_t *dst_bits,
5017                  int       src_stride,
5018                  int       dst_stride,
5019                  int       src_bpp,
5020                  int       dst_bpp,
5021                  int       src_x,
5022                  int       src_y,
5023                  int       dst_x,
5024                  int       dst_y,
5025                  int       width,
5026                  int       height)
5027 {
5028     uint8_t *   src_bytes;
5029     uint8_t *   dst_bytes;
5030     int byte_width;
5031
5032     if (src_bpp != dst_bpp)
5033         return FALSE;
5034
5035     if (src_bpp == 16)
5036     {
5037         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5038         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5039         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5040         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5041         byte_width = 2 * width;
5042         src_stride *= 2;
5043         dst_stride *= 2;
5044     }
5045     else if (src_bpp == 32)
5046     {
5047         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5048         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5049         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5050         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5051         byte_width = 4 * width;
5052         src_stride *= 4;
5053         dst_stride *= 4;
5054     }
5055     else
5056     {
5057         return FALSE;
5058     }
5059
5060     while (height--)
5061     {
5062         int w;
5063         uint8_t *s = src_bytes;
5064         uint8_t *d = dst_bytes;
5065         src_bytes += src_stride;
5066         dst_bytes += dst_stride;
5067         w = byte_width;
5068
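             /* Align the destination to 4 and then 16 bytes, stream 64-byte
              * blocks (unaligned loads, aligned stores), then drain the
              * 16-, 4- and 2-byte tails. */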
5069         while (w >= 2 && ((unsigned long)d & 3))
5070         {
5071             *(uint16_t *)d = *(uint16_t *)s;
5072             w -= 2;
5073             s += 2;
5074             d += 2;
5075         }
5076
5077         while (w >= 4 && ((unsigned long)d & 15))
5078         {
5079             *(uint32_t *)d = *(uint32_t *)s;
5080
5081             w -= 4;
5082             s += 4;
5083             d += 4;
5084         }
5085
5086         while (w >= 64)
5087         {
5088             __m128i xmm0, xmm1, xmm2, xmm3;
5089
5090             xmm0 = load_128_unaligned ((__m128i*)(s));
5091             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5092             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5093             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5094
5095             save_128_aligned ((__m128i*)(d),    xmm0);
5096             save_128_aligned ((__m128i*)(d + 16), xmm1);
5097             save_128_aligned ((__m128i*)(d + 32), xmm2);
5098             save_128_aligned ((__m128i*)(d + 48), xmm3);
5099
5100             s += 64;
5101             d += 64;
5102             w -= 64;
5103         }
5104
5105         while (w >= 16)
5106         {
5107             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5108
5109             w -= 16;
5110             d += 16;
5111             s += 16;
5112         }
5113
5114         while (w >= 4)
5115         {
5116             *(uint32_t *)d = *(uint32_t *)s;
5117
5118             w -= 4;
5119             s += 4;
5120             d += 4;
5121         }
5122
5123         if (w >= 2)
5124         {
5125             *(uint16_t *)d = *(uint16_t *)s;
5126             w -= 2;
5127             s += 2;
5128             d += 2;
5129         }
5130     }
5131
5132     _mm_empty ();
5133
5134     return TRUE;
5135 }
5136
5137 static void
5138 sse2_composite_copy_area (pixman_implementation_t *imp,
5139                           pixman_op_t              op,
5140                           pixman_image_t *         src_image,
5141                           pixman_image_t *         mask_image,
5142                           pixman_image_t *         dst_image,
5143                           int32_t                  src_x,
5144                           int32_t                  src_y,
5145                           int32_t                  mask_x,
5146                           int32_t                  mask_y,
5147                           int32_t                  dest_x,
5148                           int32_t                  dest_y,
5149                           int32_t                  width,
5150                           int32_t                  height)
5151 {
5152     pixman_blt_sse2 (src_image->bits.bits,
5153                      dst_image->bits.bits,
5154                      src_image->bits.rowstride,
5155                      dst_image->bits.rowstride,
5156                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5157                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5158                      src_x, src_y, dest_x, dest_y, width, height);
5159 }
5160
5161 static void
5162 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5163                                  pixman_op_t              op,
5164                                  pixman_image_t *         src_image,
5165                                  pixman_image_t *         mask_image,
5166                                  pixman_image_t *         dst_image,
5167                                  int32_t                  src_x,
5168                                  int32_t                  src_y,
5169                                  int32_t                  mask_x,
5170                                  int32_t                  mask_y,
5171                                  int32_t                  dest_x,
5172                                  int32_t                  dest_y,
5173                                  int32_t                  width,
5174                                  int32_t                  height)
5175 {
5176     uint32_t    *src, *src_line, s;
5177     uint32_t    *dst, *dst_line, d;
5178     uint8_t         *mask, *mask_line;
5179     uint32_t m;
5180     int src_stride, mask_stride, dst_stride;
5181     int32_t w;
5182     __m64 ms;
5183
5184     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5185     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5186     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5187
5188     PIXMAN_IMAGE_GET_LINE (
5189         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5190     PIXMAN_IMAGE_GET_LINE (
5191         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5192     PIXMAN_IMAGE_GET_LINE (
5193         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5194
5195     while (height--)
5196     {
5197         src = src_line;
5198         src_line += src_stride;
5199         dst = dst_line;
5200         dst_line += dst_stride;
5201         mask = mask_line;
5202         mask_line += mask_stride;
5203
5204         w = width;
5205
5206         while (w && (unsigned long)dst & 15)
5207         {
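                 /* The source is x888: force the alpha channel to opaque. */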
5208             s = 0xff000000 | *src++;
5209             m = (uint32_t) *mask++;
5210             d = *dst;
5211             ms = unpack_32_1x64 (s);
5212
5213             if (m != 0xff)
5214             {
5215                 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5216                 __m64 md = unpack_32_1x64 (d);
5217
5218                 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5219             }
5220
5221             *dst++ = pack_1x64_32 (ms);
5222             w--;
5223         }
5224
5225         while (w >= 4)
5226         {
5227             m = *(uint32_t*) mask;
5228             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5229
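                 /* All four a8 mask values are 0xff: the (now opaque) source
                  * simply replaces the destination. */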
5230             if (m == 0xffffffff)
5231             {
5232                 save_128_aligned ((__m128i*)dst, xmm_src);
5233             }
5234             else
5235             {
5236                 xmm_dst = load_128_aligned ((__m128i*)dst);
5237
5238                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128 ());
5239
5240                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5241                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5242                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5243
5244                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5245
5246                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5247
5248                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5249             }
5250
5251             src += 4;
5252             dst += 4;
5253             mask += 4;
5254             w -= 4;
5255         }
5256
5257         while (w)
5258         {
5259             m = (uint32_t) *mask++;
5260
5261             if (m)
5262             {
5263                 s = 0xff000000 | *src;
5264
5265                 if (m == 0xff)
5266                 {
5267                     *dst = s;
5268                 }
5269                 else
5270                 {
5271                     __m64 ma, md, ms;
5272
5273                     d = *dst;
5274
5275                     ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5276                     md = unpack_32_1x64 (d);
5277                     ms = unpack_32_1x64 (s);
5278
5279                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5280                 }
5281
5282             }
5283
5284             src++;
5285             dst++;
5286             w--;
5287         }
5288     }
5289
5290     _mm_empty ();
5291 }
5292
5293 static void
5294 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5295                                  pixman_op_t              op,
5296                                  pixman_image_t *         src_image,
5297                                  pixman_image_t *         mask_image,
5298                                  pixman_image_t *         dst_image,
5299                                  int32_t                  src_x,
5300                                  int32_t                  src_y,
5301                                  int32_t                  mask_x,
5302                                  int32_t                  mask_y,
5303                                  int32_t                  dest_x,
5304                                  int32_t                  dest_y,
5305                                  int32_t                  width,
5306                                  int32_t                  height)
5307 {
5308     uint32_t    *src, *src_line, s;
5309     uint32_t    *dst, *dst_line, d;
5310     uint8_t         *mask, *mask_line;
5311     uint32_t m;
5312     int src_stride, mask_stride, dst_stride;
5313     int32_t w;
5314
5315     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5316     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5317     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5318
5319     PIXMAN_IMAGE_GET_LINE (
5320         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5321     PIXMAN_IMAGE_GET_LINE (
5322         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5323     PIXMAN_IMAGE_GET_LINE (
5324         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5325
5326     while (height--)
5327     {
5328         src = src_line;
5329         src_line += src_stride;
5330         dst = dst_line;
5331         dst_line += dst_stride;
5332         mask = mask_line;
5333         mask_line += mask_stride;
5334
5335         w = width;
5336
5337         while (w && (unsigned long)dst & 15)
5338         {
5339             uint32_t sa;
5340
5341             s = *src++;
5342             m = (uint32_t) *mask++;
5343             d = *dst;
5344
5345             sa = s >> 24;
5346
5347             if (m)
5348             {
5349                 if (sa == 0xff && m == 0xff)
5350                 {
5351                     *dst = s;
5352                 }
5353                 else
5354                 {
5355                     __m64 ms, md, ma, msa;
5356
5357                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5358                     ms = unpack_32_1x64 (s);
5359                     md = unpack_32_1x64 (d);
5360
5361                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5362
5363                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5364                 }
5365             }
5366
5367             dst++;
5368             w--;
5369         }
5370
5371         while (w >= 4)
5372         {
5373             m = *(uint32_t *) mask;
5374
5375             if (m)
5376             {
5377                 xmm_src = load_128_unaligned ((__m128i*)src);
5378
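                     /* Fully opaque mask and source: OVER reduces to a copy. */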
5379                 if (m == 0xffffffff && is_opaque (xmm_src))
5380                 {
5381                     save_128_aligned ((__m128i *)dst, xmm_src);
5382                 }
5383                 else
5384                 {
5385                     xmm_dst = load_128_aligned ((__m128i *)dst);
5386
5387                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128 ());
5388
5389                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5390                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5391                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5392
5393                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5394                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5395
5396                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5397                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5398
5399                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5400                 }
5401             }
5402
5403             src += 4;
5404             dst += 4;
5405             mask += 4;
5406             w -= 4;
5407         }
5408
5409         while (w)
5410         {
5411             uint32_t sa;
5412
5413             s = *src++;
5414             m = (uint32_t) *mask++;
5415             d = *dst;
5416
5417             sa = s >> 24;
5418
5419             if (m)
5420             {
5421                 if (sa == 0xff && m == 0xff)
5422                 {
5423                     *dst = s;
5424                 }
5425                 else
5426                 {
5427                     __m64 ms, md, ma, msa;
5428
5429                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5430                     ms = unpack_32_1x64 (s);
5431                     md = unpack_32_1x64 (d);
5432
5433                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5434
5435                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5436                 }
5437             }
5438
5439             dst++;
5440             w--;
5441         }
5442     }
5443
5444     _mm_empty ();
5445 }
5446
5447 static void
5448 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5449                                     pixman_op_t              op,
5450                                     pixman_image_t *         src_image,
5451                                     pixman_image_t *         mask_image,
5452                                     pixman_image_t *         dst_image,
5453                                     int32_t                  src_x,
5454                                     int32_t                  src_y,
5455                                     int32_t                  mask_x,
5456                                     int32_t                  mask_y,
5457                                     int32_t                  dest_x,
5458                                     int32_t                  dest_y,
5459                                     int32_t                  width,
5460                                     int32_t                  height)
5461 {
5462     uint32_t src;
5463     uint32_t    *dst_line, *dst;
5464     __m128i xmm_src;
5465     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5466     __m128i xmm_dsta_hi, xmm_dsta_lo;
5467     int dst_stride;
5468     int32_t w;
5469
5470     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5471
5472     if (src == 0)
5473         return;
5474
5475     PIXMAN_IMAGE_GET_LINE (
5476         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5477
5478     xmm_src = expand_pixel_32_1x128 (src);
5479
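         /* OVER_REVERSE: the destination is composited over the solid source,
          * so dst and its alpha take the source slots in the over helpers. */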
5480     while (height--)
5481     {
5482         dst = dst_line;
5483
5484         dst_line += dst_stride;
5485         w = width;
5486
5487         while (w && (unsigned long)dst & 15)
5488         {
5489             __m64 vd;
5490
5491             vd = unpack_32_1x64 (*dst);
5492
5493             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5494                                             _mm_movepi64_pi64 (xmm_src)));
5495             w--;
5496             dst++;
5497         }
5498
5499         while (w >= 4)
5500         {
5501             __m128i tmp_lo, tmp_hi;
5502
5503             xmm_dst = load_128_aligned ((__m128i*)dst);
5504
5505             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5506             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5507
5508             tmp_lo = xmm_src;
5509             tmp_hi = xmm_src;
5510
5511             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5512                         &xmm_dsta_lo, &xmm_dsta_hi,
5513                         &tmp_lo, &tmp_hi);
5514
5515             save_128_aligned (
5516                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5517
5518             w -= 4;
5519             dst += 4;
5520         }
5521
5522         while (w)
5523         {
5524             __m64 vd;
5525
5526             vd = unpack_32_1x64 (*dst);
5527
5528             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5529                                             _mm_movepi64_pi64 (xmm_src)));
5530             w--;
5531             dst++;
5532         }
5533
5534     }
5535
5536     _mm_empty ();
5537 }
5538
5539 static void
5540 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5541                                     pixman_op_t              op,
5542                                     pixman_image_t *         src_image,
5543                                     pixman_image_t *         mask_image,
5544                                     pixman_image_t *         dst_image,
5545                                     int32_t                  src_x,
5546                                     int32_t                  src_y,
5547                                     int32_t                  mask_x,
5548                                     int32_t                  mask_y,
5549                                     int32_t                  dest_x,
5550                                     int32_t                  dest_y,
5551                                     int32_t                  width,
5552                                     int32_t                  height)
5553 {
5554     uint32_t    *src, *src_line, s;
5555     uint32_t    *dst, *dst_line, d;
5556     uint32_t    *mask, *mask_line;
5557     uint32_t    m;
5558     int src_stride, mask_stride, dst_stride;
5559     int32_t w;
5560
5561     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5562     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5563     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5564
5565     PIXMAN_IMAGE_GET_LINE (
5566         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5567     PIXMAN_IMAGE_GET_LINE (
5568         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5569     PIXMAN_IMAGE_GET_LINE (
5570         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5571
5572     while (height--)
5573     {
5574         src = src_line;
5575         src_line += src_stride;
5576         dst = dst_line;
5577         dst_line += dst_stride;
5578         mask = mask_line;
5579         mask_line += mask_stride;
5580
5581         w = width;
5582
5583         while (w && (unsigned long)dst & 15)
5584         {
5585             uint32_t sa;
5586
5587             s = *src++;
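                 /* The mask is a8r8g8b8; only its alpha byte is needed here. */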
5588             m = (*mask++) >> 24;
5589             d = *dst;
5590
5591             sa = s >> 24;
5592
5593             if (m)
5594             {
5595                 if (sa == 0xff && m == 0xff)
5596                 {
5597                     *dst = s;
5598                 }
5599                 else
5600                 {
5601                     __m64 ms, md, ma, msa;
5602
5603                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5604                     ms = unpack_32_1x64 (s);
5605                     md = unpack_32_1x64 (d);
5606
5607                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5608
5609                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5610                 }
5611             }
5612
5613             dst++;
5614             w--;
5615         }
5616
5617         while (w >= 4)
5618         {
5619             xmm_mask = load_128_unaligned ((__m128i*)mask);
5620
5621             if (!is_transparent (xmm_mask))
5622             {
5623                 xmm_src = load_128_unaligned ((__m128i*)src);
5624
5625                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5626                 {
5627                     save_128_aligned ((__m128i *)dst, xmm_src);
5628                 }
5629                 else
5630                 {
5631                     xmm_dst = load_128_aligned ((__m128i *)dst);
5632
5633                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5634                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5635                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5636
5637                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5638                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5639
5640                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5641                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5642
5643                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5644                 }
5645             }
5646
5647             src += 4;
5648             dst += 4;
5649             mask += 4;
5650             w -= 4;
5651         }
5652
5653         while (w)
5654         {
5655             uint32_t sa;
5656
5657             s = *src++;
5658             m = (*mask++) >> 24;
5659             d = *dst;
5660
5661             sa = s >> 24;
5662
5663             if (m)
5664             {
5665                 if (sa == 0xff && m == 0xff)
5666                 {
5667                     *dst = s;
5668                 }
5669                 else
5670                 {
5671                     __m64 ms, md, ma, msa;
5672
5673                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5674                     ms = unpack_32_1x64 (s);
5675                     md = unpack_32_1x64 (d);
5676
5677                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5678
5679                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5680                 }
5681             }
5682
5683             dst++;
5684             w--;
5685         }
5686     }
5687
5688     _mm_empty ();
5689 }
5690
5691 /* A variant of 'core_combine_over_u_sse2' that fetches its source pixels
5692  * through a 16.16 fixed-point coordinate, for nearest-neighbour scaling */
5692 static force_inline void
5693 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5694                                              const uint32_t* ps,
5695                                              int32_t         w,
5696                                              pixman_fixed_t  vx,
5697                                              pixman_fixed_t  unit_x,
5698                                              pixman_fixed_t  max_vx)
5699 {
5700     uint32_t s, d;
5701     const uint32_t* pm = NULL;
5702
5703     __m128i xmm_dst_lo, xmm_dst_hi;
5704     __m128i xmm_src_lo, xmm_src_hi;
5705     __m128i xmm_alpha_lo, xmm_alpha_hi;
5706
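         /* vx advances by unit_x (the fixed-point source step) for each
          * destination pixel; vx >> 16 is the integer source index. */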
5707     /* Align dst on a 16-byte boundary */
5708     while (w && ((unsigned long)pd & 15))
5709     {
5710         d = *pd;
5711         s = combine1 (ps + (vx >> 16), pm);
5712         vx += unit_x;
5713
5714         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5715         if (pm)
5716             pm++;
5717         w--;
5718     }
5719
5720     while (w >= 4)
5721     {
5722         __m128i tmp;
5723         uint32_t tmp1, tmp2, tmp3, tmp4;
5724
5725         tmp1 = ps[vx >> 16];
5726         vx += unit_x;
5727         tmp2 = ps[vx >> 16];
5728         vx += unit_x;
5729         tmp3 = ps[vx >> 16];
5730         vx += unit_x;
5731         tmp4 = ps[vx >> 16];
5732         vx += unit_x;
5733
5734         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5735
5736         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5737
5738         if (is_opaque (xmm_src_hi))
5739         {
5740             save_128_aligned ((__m128i*)pd, xmm_src_hi);
5741         }
5742         else if (!is_zero (xmm_src_hi))
5743         {
5744             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5745
5746             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5747             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5748
5749             expand_alpha_2x128 (
5750                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5751
5752             over_2x128 (&xmm_src_lo, &xmm_src_hi,
5753                         &xmm_alpha_lo, &xmm_alpha_hi,
5754                         &xmm_dst_lo, &xmm_dst_hi);
5755
5756             /* rebuild the 4 pixel data and save */
5757             save_128_aligned ((__m128i*)pd,
5758                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5759         }
5760
5761         w -= 4;
5762         pd += 4;
5763         if (pm)
5764             pm += 4;
5765     }
5766
5767     while (w)
5768     {
5769         d = *pd;
5770         s = combine1 (ps + (vx >> 16), pm);
5771         vx += unit_x;
5772
5773         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5774         if (pm)
5775             pm++;
5776
5777         w--;
5778     }
5779     _mm_empty ();
5780 }
5781
5782 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5783                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5784                        uint32_t, uint32_t, COVER);
5785 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5786                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5787                        uint32_t, uint32_t, NONE);
5788 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5789                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5790                        uint32_t, uint32_t, PAD);
5791
5792 static const pixman_fast_path_t sse2_fast_paths[] =
5793 {
5794     /* PIXMAN_OP_OVER */
5795     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5796     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5797     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5798     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5799     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5800     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5801     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5802     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5803     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5804     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5805     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5806     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5807     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5808     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5809     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5810     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5811     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5812     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5813     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5814     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5815     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5816     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5817     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5818     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5819     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5820     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5821     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5822     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5823     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5824     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5825     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5826     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5827     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5828     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5829     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5830     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5831     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5832     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5833     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5834     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5835     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5836     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5837     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5838     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5839     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5840     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5841
5842     /* PIXMAN_OP_OVER_REVERSE */
5843     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5844     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5845
5846     /* PIXMAN_OP_ADD */
5847     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5848     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
5849     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5850     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5851     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5852     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5853
5854     /* PIXMAN_OP_SRC */
5855     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5856     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5857     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5858     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5859     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5860     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5861     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5862     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5863     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5864     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5865     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5866     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5867     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5868     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5869
5870     /* PIXMAN_OP_IN */
5871     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5872     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5873     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5874
5875     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5876     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5877     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5878     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5879     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5880     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5881     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5882     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5883     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5884     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5885     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5886     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5887
5888     { PIXMAN_OP_NONE },
5889 };
5890
5891 static pixman_bool_t
5892 sse2_blt (pixman_implementation_t *imp,
5893           uint32_t *               src_bits,
5894           uint32_t *               dst_bits,
5895           int                      src_stride,
5896           int                      dst_stride,
5897           int                      src_bpp,
5898           int                      dst_bpp,
5899           int                      src_x,
5900           int                      src_y,
5901           int                      dst_x,
5902           int                      dst_y,
5903           int                      width,
5904           int                      height)
5905 {
5906     if (!pixman_blt_sse2 (
5907             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5908             src_x, src_y, dst_x, dst_y, width, height))
5909
5910     {
5911         return _pixman_implementation_blt (
5912             imp->delegate,
5913             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5914             src_x, src_y, dst_x, dst_y, width, height);
5915     }
5916
5917     return TRUE;
5918 }
5919
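     /* force_align_arg_pointer realigns the stack on entry, so 16-byte
      * aligned SSE2 spills are safe even when a 32-bit caller only
      * maintained 4-byte stack alignment. */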
5920 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5921 __attribute__((__force_align_arg_pointer__))
5922 #endif
5923 static pixman_bool_t
5924 sse2_fill (pixman_implementation_t *imp,
5925            uint32_t *               bits,
5926            int                      stride,
5927            int                      bpp,
5928            int                      x,
5929            int                      y,
5930            int                      width,
5931            int                      height,
5932            uint32_t xor)
5933 {
5934     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5935     {
5936         return _pixman_implementation_fill (
5937             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5938     }
5939
5940     return TRUE;
5941 }
5942
5943 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5944 __attribute__((__force_align_arg_pointer__))
5945 #endif
5946 pixman_implementation_t *
5947 _pixman_implementation_create_sse2 (void)
5948 {
5949 #ifdef USE_MMX
5950     pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
5951 #else
5952     pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
5953 #endif
5954     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
5955
5956     /* SSE2 constants */
5957     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5958     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5959     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5960     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5961     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5962     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5963     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5964     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5965     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
5966     mask_0080 = create_mask_16_128 (0x0080);
5967     mask_00ff = create_mask_16_128 (0x00ff);
5968     mask_0101 = create_mask_16_128 (0x0101);
5969     mask_ffff = create_mask_16_128 (0xffff);
5970     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5971     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5972
5973     /* MMX constants */
5974     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5975     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5976
5977     mask_x0080 = create_mask_16_64 (0x0080);
5978     mask_x00ff = create_mask_16_64 (0x00ff);
5979     mask_x0101 = create_mask_16_64 (0x0101);
5980     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5981
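         /* The __m64 constants above leave the CPU in MMX state; EMMS clears
          * it before any floating point code runs. */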
5982     _mm_empty ();
5983
5984     /* Set up function pointers */
5985
5986     /* SSE code patch for fbcompose.c */
5987     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5988     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5989     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5990     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5991     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5992     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5993     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5994     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5995     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5996     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5997
5998     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5999
6000     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6001     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6002     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6003     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6004     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6005     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6006     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6007     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6008     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6009     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6010     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6011
6012     imp->blt = sse2_blt;
6013     imp->fill = sse2_fill;
6014
6015     return imp;
6016 }
6017
6018 #endif /* USE_SSE2 */