/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

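/* Constant vectors used throughout this file.  They are presumably
 * initialized by the SSE2 implementation's setup code, which is not
 * part of this excerpt.
 */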
static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

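/* Zero-extend the four 8-bit channels of one a8r8g8b8 pixel into the
 * 16-bit lanes of the low half of an XMM register.
 */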
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

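/* Convert r5g6b5 pixels (one per 32-bit lane) to x8r8g8b8.  The shifts
 * place each field in its 8-bit slot; judging by the shift amounts, the
 * "fix" masks (values initialized elsewhere) then replicate the top
 * bits of each field into the vacated low bits so the expanded channels
 * span the full 0x00-0xff range.
 */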
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

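/* Pack one x8r8g8b8 pixel down to r5g6b5 by keeping only the top bits
 * of each channel.
 */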
static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

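/* _mm_movemask_epi8 gathers the top bit of each of the 16 bytes.
 * Masking with 0x8888 inspects only bytes 3, 7, 11 and 15, i.e. the
 * alpha byte of each of the four pixels, so these predicates test
 * whether all four alphas are 0xff, all bytes are zero, or all four
 * alphas are zero, respectively.
 */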
static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

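/* Per-channel multiply of two unpacked pixel pairs with an exact
 * divide by 255: taking the high 16 bits of a multiply by 0x0101
 * computes (t + (t >> 8)) >> 8, so with the 0x0080 bias this yields
 * the rounded (a * b) / 255.
 */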
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}

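/* src * alpha_dst + dst * alpha_src, with saturating addition; this is
 * the building block for the ATOP and XOR combiners below.
 */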
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

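/* Porter-Duff OVER on two unpacked pixel pairs, computed in place:
 * dst = src + dst * (255 - alpha) / 255, saturating.
 */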
static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

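/* (src IN mask) OVER dst: the source and its alpha are first
 * multiplied by the component mask, then composited over the
 * destination.
 */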
static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

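/* Scalar OVER for a single pixel, with fast paths for a fully opaque
 * source (copy src) and a fully transparent one (keep dst).
 */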
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

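/* Fetch one source pixel, multiplying it by the alpha of the
 * corresponding mask pixel when a mask is present.
 */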
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

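/* Fetch four source pixels, applying the mask's alpha when present.
 * A fully transparent mask short-circuits to zero without touching
 * the source.
 */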
static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

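/* The combiners below all follow the same pattern: a scalar prologue
 * until dst reaches a 16-byte boundary, a SIMD body handling four
 * pixels per iteration, and a scalar epilogue for the remainder.
 */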
static force_inline void
core_combine_over_u_sse2_mask (uint32_t *         pd,
                               const uint32_t*    ps,
                               const uint32_t*    pm,
                               int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t*    ps,
                                  int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

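/* ADD is a plain saturating per-byte addition, so no unpacking is
 * needed: four pixels are added with a single _mm_adds_epu8.
 */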
static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}

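/* SATURATE: if the source alpha fits into the destination's free
 * alpha (~dst >> 24), a plain saturating addition suffices; otherwise
 * the source is first scaled by the ratio da / sa.
 */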
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned  ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
        /* if some source alpha is greater than the corresponding ~dst alpha */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

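/* Component-alpha (CA) combiners follow.  Here the mask supplies a
 * separate 8-bit factor per channel, so source pixels are multiplied
 * by the mask channel-wise rather than by a single alpha value.
 */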
1481 static void
1482 sse2_combine_src_ca (pixman_implementation_t *imp,
1483                      pixman_op_t              op,
1484                      uint32_t *               pd,
1485                      const uint32_t *         ps,
1486                      const uint32_t *         pm,
1487                      int                      w)
1488 {
1489     uint32_t s, m;
1490
1491     __m128i xmm_src_lo, xmm_src_hi;
1492     __m128i xmm_mask_lo, xmm_mask_hi;
1493     __m128i xmm_dst_lo, xmm_dst_hi;
1494
1495     while (w && (unsigned long)pd & 15)
1496     {
1497         s = *ps++;
1498         m = *pm++;
1499         *pd++ = pack_1x128_32 (
1500             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1501         w--;
1502     }
1503
1504     while (w >= 4)
1505     {
1506         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1507         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1508
1509         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1510         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1511
1512         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1513                             &xmm_mask_lo, &xmm_mask_hi,
1514                             &xmm_dst_lo, &xmm_dst_hi);
1515
1516         save_128_aligned (
1517             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1518
1519         ps += 4;
1520         pd += 4;
1521         pm += 4;
1522         w -= 4;
1523     }
1524
1525     while (w)
1526     {
1527         s = *ps++;
1528         m = *pm++;
1529         *pd++ = pack_1x128_32 (
1530             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1531         w--;
1532     }
1533 }
1534
1535 static force_inline uint32_t
1536 core_combine_over_ca_pixel_sse2 (uint32_t src,
1537                                  uint32_t mask,
1538                                  uint32_t dst)
1539 {
1540     __m128i s = unpack_32_1x128 (src);
1541     __m128i expAlpha = expand_alpha_1x128 (s);
1542     __m128i unpk_mask = unpack_32_1x128 (mask);
1543     __m128i unpk_dst  = unpack_32_1x128 (dst);
1544
1545     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1546 }
1547
1548 static void
1549 sse2_combine_over_ca (pixman_implementation_t *imp,
1550                       pixman_op_t              op,
1551                       uint32_t *               pd,
1552                       const uint32_t *         ps,
1553                       const uint32_t *         pm,
1554                       int                      w)
1555 {
1556     uint32_t s, m, d;
1557
1558     __m128i xmm_alpha_lo, xmm_alpha_hi;
1559     __m128i xmm_src_lo, xmm_src_hi;
1560     __m128i xmm_dst_lo, xmm_dst_hi;
1561     __m128i xmm_mask_lo, xmm_mask_hi;
1562
1563     while (w && (unsigned long)pd & 15)
1564     {
1565         s = *ps++;
1566         m = *pm++;
1567         d = *pd;
1568
1569         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1570         w--;
1571     }
1572
1573     while (w >= 4)
1574     {
1575         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1576         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1577         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1578
1579         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1580         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1581         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1582
1583         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1584                             &xmm_alpha_lo, &xmm_alpha_hi);
1585
1586         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1587                        &xmm_alpha_lo, &xmm_alpha_hi,
1588                        &xmm_mask_lo, &xmm_mask_hi,
1589                        &xmm_dst_lo, &xmm_dst_hi);
1590
1591         save_128_aligned (
1592             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1593
1594         ps += 4;
1595         pd += 4;
1596         pm += 4;
1597         w -= 4;
1598     }
1599
1600     while (w)
1601     {
1602         s = *ps++;
1603         m = *pm++;
1604         d = *pd;
1605
1606         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1607         w--;
1608     }
1609 }
1610
1611 static force_inline uint32_t
1612 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1613                                          uint32_t mask,
1614                                          uint32_t dst)
1615 {
1616     __m128i d = unpack_32_1x128 (dst);
1617
1618     return pack_1x128_32 (
1619         over_1x128 (d, expand_alpha_1x128 (d),
1620                     pix_multiply_1x128 (unpack_32_1x128 (src),
1621                                         unpack_32_1x128 (mask))));
1622 }
1623
1624 static void
1625 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1626                               pixman_op_t              op,
1627                               uint32_t *               pd,
1628                               const uint32_t *         ps,
1629                               const uint32_t *         pm,
1630                               int                      w)
1631 {
1632     uint32_t s, m, d;
1633
1634     __m128i xmm_alpha_lo, xmm_alpha_hi;
1635     __m128i xmm_src_lo, xmm_src_hi;
1636     __m128i xmm_dst_lo, xmm_dst_hi;
1637     __m128i xmm_mask_lo, xmm_mask_hi;
1638
1639     while (w && (unsigned long)pd & 15)
1640     {
1641         s = *ps++;
1642         m = *pm++;
1643         d = *pd;
1644
1645         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1646         w--;
1647     }
1648
1649     while (w >= 4)
1650     {
1651         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1652         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1653         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1654
1655         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1656         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1657         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1658
1659         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1660                             &xmm_alpha_lo, &xmm_alpha_hi);
1661         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1662                             &xmm_mask_lo, &xmm_mask_hi,
1663                             &xmm_mask_lo, &xmm_mask_hi);
1664
1665         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1666                     &xmm_alpha_lo, &xmm_alpha_hi,
1667                     &xmm_mask_lo, &xmm_mask_hi);
1668
1669         save_128_aligned (
1670             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1671
1672         ps += 4;
1673         pd += 4;
1674         pm += 4;
1675         w -= 4;
1676     }
1677
1678     while (w)
1679     {
1680         s = *ps++;
1681         m = *pm++;
1682         d = *pd;
1683
1684         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1685         w--;
1686     }
1687 }
1688
1689 static void
1690 sse2_combine_in_ca (pixman_implementation_t *imp,
1691                     pixman_op_t              op,
1692                     uint32_t *               pd,
1693                     const uint32_t *         ps,
1694                     const uint32_t *         pm,
1695                     int                      w)
1696 {
1697     uint32_t s, m, d;
1698
1699     __m128i xmm_alpha_lo, xmm_alpha_hi;
1700     __m128i xmm_src_lo, xmm_src_hi;
1701     __m128i xmm_dst_lo, xmm_dst_hi;
1702     __m128i xmm_mask_lo, xmm_mask_hi;
1703
1704     while (w && (unsigned long)pd & 15)
1705     {
1706         s = *ps++;
1707         m = *pm++;
1708         d = *pd;
1709
1710         *pd++ = pack_1x128_32 (
1711             pix_multiply_1x128 (
1712                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1713                 expand_alpha_1x128 (unpack_32_1x128 (d))));
1714
1715         w--;
1716     }
1717
1718     while (w >= 4)
1719     {
1720         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1721         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1722         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1723
1724         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1725         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1726         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1727
1728         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1729                             &xmm_alpha_lo, &xmm_alpha_hi);
1730
1731         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1732                             &xmm_mask_lo, &xmm_mask_hi,
1733                             &xmm_dst_lo, &xmm_dst_hi);
1734
1735         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1736                             &xmm_alpha_lo, &xmm_alpha_hi,
1737                             &xmm_dst_lo, &xmm_dst_hi);
1738
1739         save_128_aligned (
1740             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1741
1742         ps += 4;
1743         pd += 4;
1744         pm += 4;
1745         w -= 4;
1746     }
1747
1748     while (w)
1749     {
1750         s = *ps++;
1751         m = *pm++;
1752         d = *pd;
1753
1754         *pd++ = pack_1x128_32 (
1755             pix_multiply_1x128 (
1756                 pix_multiply_1x128 (
1757                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1758                 expand_alpha_1x128 (unpack_32_1x128 (d))));
1759
1760         w--;
1761     }
1762 }
1763
1764 static void
1765 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1766                             pixman_op_t              op,
1767                             uint32_t *               pd,
1768                             const uint32_t *         ps,
1769                             const uint32_t *         pm,
1770                             int                      w)
1771 {
1772     uint32_t s, m, d;
1773
1774     __m128i xmm_alpha_lo, xmm_alpha_hi;
1775     __m128i xmm_src_lo, xmm_src_hi;
1776     __m128i xmm_dst_lo, xmm_dst_hi;
1777     __m128i xmm_mask_lo, xmm_mask_hi;
1778
1779     while (w && (unsigned long)pd & 15)
1780     {
1781         s = *ps++;
1782         m = *pm++;
1783         d = *pd;
1784
1785         *pd++ = pack_1x128_32 (
1786             pix_multiply_1x128 (
1787                 unpack_32_1x128 (d),
1788                 pix_multiply_1x128 (unpack_32_1x128 (m),
1789                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1790         w--;
1791     }
1792
1793     while (w >= 4)
1794     {
1795         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1796         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1797         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1798
1799         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1800         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1801         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1802
1803         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1804                             &xmm_alpha_lo, &xmm_alpha_hi);
1805         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1806                             &xmm_alpha_lo, &xmm_alpha_hi,
1807                             &xmm_alpha_lo, &xmm_alpha_hi);
1808
1809         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1810                             &xmm_alpha_lo, &xmm_alpha_hi,
1811                             &xmm_dst_lo, &xmm_dst_hi);
1812
1813         save_128_aligned (
1814             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1815
1816         ps += 4;
1817         pd += 4;
1818         pm += 4;
1819         w -= 4;
1820     }
1821
1822     while (w)
1823     {
1824         s = *ps++;
1825         m = *pm++;
1826         d = *pd;
1827
1828         *pd++ = pack_1x128_32 (
1829             pix_multiply_1x128 (
1830                 unpack_32_1x128 (d),
1831                 pix_multiply_1x128 (unpack_32_1x128 (m),
1832                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1833         w--;
1834     }
1835 }
1836
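/* Component-alpha OUT, computed per channel:
 *     dest = (src * mask) * (1 - ALPHA (dest))
 */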
1837 static void
1838 sse2_combine_out_ca (pixman_implementation_t *imp,
1839                      pixman_op_t              op,
1840                      uint32_t *               pd,
1841                      const uint32_t *         ps,
1842                      const uint32_t *         pm,
1843                      int                      w)
1844 {
1845     uint32_t s, m, d;
1846
1847     __m128i xmm_alpha_lo, xmm_alpha_hi;
1848     __m128i xmm_src_lo, xmm_src_hi;
1849     __m128i xmm_dst_lo, xmm_dst_hi;
1850     __m128i xmm_mask_lo, xmm_mask_hi;
1851
1852     while (w && (unsigned long)pd & 15)
1853     {
1854         s = *ps++;
1855         m = *pm++;
1856         d = *pd;
1857
1858         *pd++ = pack_1x128_32 (
1859             pix_multiply_1x128 (
1860                 pix_multiply_1x128 (
1861                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1862                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1863         w--;
1864     }
1865
1866     while (w >= 4)
1867     {
1868         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1869         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1870         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1871
1872         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1873         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1874         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1875
1876         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1877                             &xmm_alpha_lo, &xmm_alpha_hi);
1878         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1879                       &xmm_alpha_lo, &xmm_alpha_hi);
1880
1881         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1882                             &xmm_mask_lo, &xmm_mask_hi,
1883                             &xmm_dst_lo, &xmm_dst_hi);
1884         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1885                             &xmm_alpha_lo, &xmm_alpha_hi,
1886                             &xmm_dst_lo, &xmm_dst_hi);
1887
1888         save_128_aligned (
1889             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1890
1891         ps += 4;
1892         pd += 4;
1893         pm += 4;
1894         w -= 4;
1895     }
1896
1897     while (w)
1898     {
1899         s = *ps++;
1900         m = *pm++;
1901         d = *pd;
1902
1903         *pd++ = pack_1x128_32 (
1904             pix_multiply_1x128 (
1905                 pix_multiply_1x128 (
1906                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1907                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1908
1909         w--;
1910     }
1911 }
1912
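/* Component-alpha OUT_REVERSE, computed per channel:
 *     dest = dest * (1 - mask * ALPHA (src))
 */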
1913 static void
1914 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1915                              pixman_op_t              op,
1916                              uint32_t *               pd,
1917                              const uint32_t *         ps,
1918                              const uint32_t *         pm,
1919                              int                      w)
1920 {
1921     uint32_t s, m, d;
1922
1923     __m128i xmm_alpha_lo, xmm_alpha_hi;
1924     __m128i xmm_src_lo, xmm_src_hi;
1925     __m128i xmm_dst_lo, xmm_dst_hi;
1926     __m128i xmm_mask_lo, xmm_mask_hi;
1927
1928     while (w && (unsigned long)pd & 15)
1929     {
1930         s = *ps++;
1931         m = *pm++;
1932         d = *pd;
1933
1934         *pd++ = pack_1x128_32 (
1935             pix_multiply_1x128 (
1936                 unpack_32_1x128 (d),
1937                 negate_1x128 (pix_multiply_1x128 (
1938                                  unpack_32_1x128 (m),
1939                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1940         w--;
1941     }
1942
1943     while (w >= 4)
1944     {
1945         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1946         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1947         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1948
1949         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1950         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1951         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1952
1953         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1954                             &xmm_alpha_lo, &xmm_alpha_hi);
1955
1956         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1957                             &xmm_alpha_lo, &xmm_alpha_hi,
1958                             &xmm_mask_lo, &xmm_mask_hi);
1959
1960         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1961                       &xmm_mask_lo, &xmm_mask_hi);
1962
1963         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1964                             &xmm_mask_lo, &xmm_mask_hi,
1965                             &xmm_dst_lo, &xmm_dst_hi);
1966
1967         save_128_aligned (
1968             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1969
1970         ps += 4;
1971         pd += 4;
1972         pm += 4;
1973         w -= 4;
1974     }
1975
1976     while (w)
1977     {
1978         s = *ps++;
1979         m = *pm++;
1980         d = *pd;
1981
1982         *pd++ = pack_1x128_32 (
1983             pix_multiply_1x128 (
1984                 unpack_32_1x128 (d),
1985                 negate_1x128 (pix_multiply_1x128 (
1986                                  unpack_32_1x128 (m),
1987                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1988         w--;
1989     }
1990 }
1991
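/* Component-alpha ATOP for a single pixel:
 *     dest = (src * mask) * ALPHA (dest) + dest * (1 - mask * ALPHA (src))
 */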
1992 static force_inline uint32_t
1993 core_combine_atop_ca_pixel_sse2 (uint32_t src,
1994                                  uint32_t mask,
1995                                  uint32_t dst)
1996 {
1997     __m128i m = unpack_32_1x128 (mask);
1998     __m128i s = unpack_32_1x128 (src);
1999     __m128i d = unpack_32_1x128 (dst);
2000     __m128i sa = expand_alpha_1x128 (s);
2001     __m128i da = expand_alpha_1x128 (d);
2002
2003     s = pix_multiply_1x128 (s, m);
2004     m = negate_1x128 (pix_multiply_1x128 (m, sa));
2005
2006     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2007 }
2008
2009 static void
2010 sse2_combine_atop_ca (pixman_implementation_t *imp,
2011                       pixman_op_t              op,
2012                       uint32_t *               pd,
2013                       const uint32_t *         ps,
2014                       const uint32_t *         pm,
2015                       int                      w)
2016 {
2017     uint32_t s, m, d;
2018
2019     __m128i xmm_src_lo, xmm_src_hi;
2020     __m128i xmm_dst_lo, xmm_dst_hi;
2021     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2022     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2023     __m128i xmm_mask_lo, xmm_mask_hi;
2024
2025     while (w && (unsigned long)pd & 15)
2026     {
2027         s = *ps++;
2028         m = *pm++;
2029         d = *pd;
2030
2031         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2032         w--;
2033     }
2034
2035     while (w >= 4)
2036     {
2037         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2038         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2039         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2040
2041         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2042         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2043         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2044
2045         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2046                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2047         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2048                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2049
2050         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2051                             &xmm_mask_lo, &xmm_mask_hi,
2052                             &xmm_src_lo, &xmm_src_hi);
2053         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2054                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2055                             &xmm_mask_lo, &xmm_mask_hi);
2056
2057         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2058
2059         pix_add_multiply_2x128 (
2060             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2061             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2062             &xmm_dst_lo, &xmm_dst_hi);
2063
2064         save_128_aligned (
2065             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2066
2067         ps += 4;
2068         pd += 4;
2069         pm += 4;
2070         w -= 4;
2071     }
2072
2073     while (w)
2074     {
2075         s = *ps++;
2076         m = *pm++;
2077         d = *pd;
2078
2079         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2080         w--;
2081     }
2082 }
2083
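/* Component-alpha ATOP_REVERSE for a single pixel:
 *     dest = dest * (mask * ALPHA (src)) + (src * mask) * (1 - ALPHA (dest))
 */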
2084 static force_inline uint32_t
2085 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2086                                          uint32_t mask,
2087                                          uint32_t dst)
2088 {
2089     __m128i m = unpack_32_1x128 (mask);
2090     __m128i s = unpack_32_1x128 (src);
2091     __m128i d = unpack_32_1x128 (dst);
2092
2093     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2094     __m128i sa = expand_alpha_1x128 (s);
2095
2096     s = pix_multiply_1x128 (s, m);
2097     m = pix_multiply_1x128 (m, sa);
2098
2099     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2100 }
2101
2102 static void
2103 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2104                               pixman_op_t              op,
2105                               uint32_t *               pd,
2106                               const uint32_t *         ps,
2107                               const uint32_t *         pm,
2108                               int                      w)
2109 {
2110     uint32_t s, m, d;
2111
2112     __m128i xmm_src_lo, xmm_src_hi;
2113     __m128i xmm_dst_lo, xmm_dst_hi;
2114     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2115     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2116     __m128i xmm_mask_lo, xmm_mask_hi;
2117
2118     while (w && (unsigned long)pd & 15)
2119     {
2120         s = *ps++;
2121         m = *pm++;
2122         d = *pd;
2123
2124         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2125         w--;
2126     }
2127
2128     while (w >= 4)
2129     {
2130         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2131         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2132         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2133
2134         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2135         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2136         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2137
2138         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2139                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2140         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2141                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2142
2143         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2144                             &xmm_mask_lo, &xmm_mask_hi,
2145                             &xmm_src_lo, &xmm_src_hi);
2146         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2147                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2148                             &xmm_mask_lo, &xmm_mask_hi);
2149
2150         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2151                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2152
2153         pix_add_multiply_2x128 (
2154             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2155             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2156             &xmm_dst_lo, &xmm_dst_hi);
2157
2158         save_128_aligned (
2159             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2160
2161         ps += 4;
2162         pd += 4;
2163         pm += 4;
2164         w -= 4;
2165     }
2166
2167     while (w)
2168     {
2169         s = *ps++;
2170         m = *pm++;
2171         d = *pd;
2172
2173         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2174         w--;
2175     }
2176 }
2177
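/* Component-alpha XOR for a single pixel:
 *     dest = dest * (1 - mask * ALPHA (src)) + (src * mask) * (1 - ALPHA (dest))
 */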
2178 static force_inline uint32_t
2179 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2180                                 uint32_t mask,
2181                                 uint32_t dst)
2182 {
2183     __m128i a = unpack_32_1x128 (mask);
2184     __m128i s = unpack_32_1x128 (src);
2185     __m128i d = unpack_32_1x128 (dst);
2186
2187     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2188                                        a, expand_alpha_1x128 (s)));
2189     __m128i dest      = pix_multiply_1x128 (s, a);
2190     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2191
2192     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2193                                                 &alpha_dst,
2194                                                 &dest,
2195                                                 &alpha_src));
2196 }
2197
2198 static void
2199 sse2_combine_xor_ca (pixman_implementation_t *imp,
2200                      pixman_op_t              op,
2201                      uint32_t *               pd,
2202                      const uint32_t *         ps,
2203                      const uint32_t *         pm,
2204                      int                      w)
2205 {
2206     uint32_t s, m, d;
2207
2208     __m128i xmm_src_lo, xmm_src_hi;
2209     __m128i xmm_dst_lo, xmm_dst_hi;
2210     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2211     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2212     __m128i xmm_mask_lo, xmm_mask_hi;
2213
2214     while (w && (unsigned long)pd & 15)
2215     {
2216         s = *ps++;
2217         m = *pm++;
2218         d = *pd;
2219
2220         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2221         w--;
2222     }
2223
2224     while (w >= 4)
2225     {
2226         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2227         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2228         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2229
2230         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2231         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2232         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2233
2234         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2235                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2236         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2237                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2238
2239         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2240                             &xmm_mask_lo, &xmm_mask_hi,
2241                             &xmm_src_lo, &xmm_src_hi);
2242         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2243                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2244                             &xmm_mask_lo, &xmm_mask_hi);
2245
2246         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2247                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2248         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2249                       &xmm_mask_lo, &xmm_mask_hi);
2250
2251         pix_add_multiply_2x128 (
2252             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2253             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2254             &xmm_dst_lo, &xmm_dst_hi);
2255
2256         save_128_aligned (
2257             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2258
2259         ps += 4;
2260         pd += 4;
2261         pm += 4;
2262         w -= 4;
2263     }
2264
2265     while (w)
2266     {
2267         s = *ps++;
2268         m = *pm++;
2269         d = *pd;
2270
2271         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2272         w--;
2273     }
2274 }
2275
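/* Component-alpha ADD: dest = CLAMP (src * mask + dest).  The saturating
 * byte add _mm_adds_epu8 keeps each channel from wrapping past 0xff.
 */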
2276 static void
2277 sse2_combine_add_ca (pixman_implementation_t *imp,
2278                      pixman_op_t              op,
2279                      uint32_t *               pd,
2280                      const uint32_t *         ps,
2281                      const uint32_t *         pm,
2282                      int                      w)
2283 {
2284     uint32_t s, m, d;
2285
2286     __m128i xmm_src_lo, xmm_src_hi;
2287     __m128i xmm_dst_lo, xmm_dst_hi;
2288     __m128i xmm_mask_lo, xmm_mask_hi;
2289
2290     while (w && (unsigned long)pd & 15)
2291     {
2292         s = *ps++;
2293         m = *pm++;
2294         d = *pd;
2295
2296         *pd++ = pack_1x128_32 (
2297             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2298                                                unpack_32_1x128 (m)),
2299                            unpack_32_1x128 (d))); /* clamp s * m + d */
2300         w--;
2301     }
2302
2303     while (w >= 4)
2304     {
2305         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2306         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2307         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2308
2309         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2310         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2311         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2312
2313         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2314                             &xmm_mask_lo, &xmm_mask_hi,
2315                             &xmm_src_lo, &xmm_src_hi);
2316
2317         save_128_aligned (
2318             (__m128i*)pd, pack_2x128_128 (
2319                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2320                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2321
2322         ps += 4;
2323         pd += 4;
2324         pm += 4;
2325         w -= 4;
2326     }
2327
2328     while (w)
2329     {
2330         s = *ps++;
2331         m = *pm++;
2332         d = *pd;
2333
2334         *pd++ = pack_1x128_32 (
2335             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2336                                                unpack_32_1x128 (m)),
2337                            unpack_32_1x128 (d)));
2338         w--;
2339     }
2340 }
2341
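/* Broadcast a 16-bit value into all eight lanes of an XMM register. */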
2342 static force_inline __m128i
2343 create_mask_16_128 (uint16_t mask)
2344 {
2345     return _mm_set1_epi16 (mask);
2346 }
2347
2348 /* Work around a code generation bug in Sun Studio 12. */
2349 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2350 # define create_mask_2x32_128(mask0, mask1)                             \
2351     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2352 #else
2353 static force_inline __m128i
2354 create_mask_2x32_128 (uint32_t mask0,
2355                       uint32_t mask1)
2356 {
2357     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2358 }
2359 #endif
2360
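/* Composite a solid color OVER an a8r8g8b8 destination.  The expanded
 * source pixel and its alpha are computed once and reused for every
 * destination pixel.
 */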
2361 static void
2362 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2363                             pixman_op_t              op,
2364                             pixman_image_t *         src_image,
2365                             pixman_image_t *         mask_image,
2366                             pixman_image_t *         dest_image,
2367                             int32_t                  src_x,
2368                             int32_t                  src_y,
2369                             int32_t                  mask_x,
2370                             int32_t                  mask_y,
2371                             int32_t                  dest_x,
2372                             int32_t                  dest_y,
2373                             int32_t                  width,
2374                             int32_t                  height)
2375 {
2376     uint32_t src;
2377     uint32_t    *dst_line, *dst, d;
2378     int32_t w;
2379     int dst_stride;
2380     __m128i xmm_src, xmm_alpha;
2381     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2382
2383     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2384
2385     if (src == 0)
2386         return;
2387
2388     PIXMAN_IMAGE_GET_LINE (
2389         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2390
2391     xmm_src = expand_pixel_32_1x128 (src);
2392     xmm_alpha = expand_alpha_1x128 (xmm_src);
2393
2394     while (height--)
2395     {
2396         dst = dst_line;
2397
2398         dst_line += dst_stride;
2399         w = width;
2400
2401         while (w && (unsigned long)dst & 15)
2402         {
2403             d = *dst;
2404             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2405                                                 xmm_alpha,
2406                                                 unpack_32_1x128 (d)));
2407             w--;
2408         }
2409
2410         while (w >= 4)
2411         {
2412             xmm_dst = load_128_aligned ((__m128i*)dst);
2413
2414             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2415
2416             over_2x128 (&xmm_src, &xmm_src,
2417                         &xmm_alpha, &xmm_alpha,
2418                         &xmm_dst_lo, &xmm_dst_hi);
2419
2420             /* rebuild the 4 pixel data and save */
2421             save_128_aligned (
2422                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2423
2424             w -= 4;
2425             dst += 4;
2426         }
2427
2428         while (w)
2429         {
2430             d = *dst;
2431             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2432                                                 xmm_alpha,
2433                                                 unpack_32_1x128 (d)));
2434             w--;
2435         }
2436
2437     }
2438 }
2439
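/* Composite a solid color OVER an r5g6b5 destination.  Each 565 pixel is
 * expanded to 8888, blended, and packed back; the SSE2 loop covers eight
 * destination pixels (one XMM register) per iteration.
 */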
2440 static void
2441 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2442                             pixman_op_t              op,
2443                             pixman_image_t *         src_image,
2444                             pixman_image_t *         mask_image,
2445                             pixman_image_t *         dest_image,
2446                             int32_t                  src_x,
2447                             int32_t                  src_y,
2448                             int32_t                  mask_x,
2449                             int32_t                  mask_y,
2450                             int32_t                  dest_x,
2451                             int32_t                  dest_y,
2452                             int32_t                  width,
2453                             int32_t                  height)
2454 {
2455     uint32_t src;
2456     uint16_t    *dst_line, *dst, d;
2457     int32_t w;
2458     int dst_stride;
2459     __m128i xmm_src, xmm_alpha;
2460     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2461
2462     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2463
2464     if (src == 0)
2465         return;
2466
2467     PIXMAN_IMAGE_GET_LINE (
2468         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2469
2470     xmm_src = expand_pixel_32_1x128 (src);
2471     xmm_alpha = expand_alpha_1x128 (xmm_src);
2472
2473     while (height--)
2474     {
2475         dst = dst_line;
2476
2477         dst_line += dst_stride;
2478         w = width;
2479
2480         while (w && (unsigned long)dst & 15)
2481         {
2482             d = *dst;
2483
2484             *dst++ = pack_565_32_16 (
2485                 pack_1x128_32 (over_1x128 (xmm_src,
2486                                            xmm_alpha,
2487                                            expand565_16_1x128 (d))));
2488             w--;
2489         }
2490
2491         while (w >= 8)
2492         {
2493             xmm_dst = load_128_aligned ((__m128i*)dst);
2494
2495             unpack_565_128_4x128 (xmm_dst,
2496                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2497
2498             over_2x128 (&xmm_src, &xmm_src,
2499                         &xmm_alpha, &xmm_alpha,
2500                         &xmm_dst0, &xmm_dst1);
2501             over_2x128 (&xmm_src, &xmm_src,
2502                         &xmm_alpha, &xmm_alpha,
2503                         &xmm_dst2, &xmm_dst3);
2504
2505             xmm_dst = pack_565_4x128_128 (
2506                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507
2508             save_128_aligned ((__m128i*)dst, xmm_dst);
2509
2510             dst += 8;
2511             w -= 8;
2512         }
2513
2514         while (w--)
2515         {
2516             d = *dst;
2517             *dst++ = pack_565_32_16 (
2518                 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2519                                            expand565_16_1x128 (d))));
2520         }
2521     }
2522
2523 }
2524
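/* ADD a solid color through an a8r8g8b8 component-alpha mask:
 *     dest = CLAMP (src * mask + dest)
 * Groups of four mask pixels that are all zero are skipped without
 * touching the destination.
 */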
2525 static void
2526 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2527                                    pixman_op_t              op,
2528                                    pixman_image_t *         src_image,
2529                                    pixman_image_t *         mask_image,
2530                                    pixman_image_t *         dest_image,
2531                                    int32_t                  src_x,
2532                                    int32_t                  src_y,
2533                                    int32_t                  mask_x,
2534                                    int32_t                  mask_y,
2535                                    int32_t                  dest_x,
2536                                    int32_t                  dest_y,
2537                                    int32_t                  width,
2538                                    int32_t                  height)
2539 {
2540     uint32_t src;
2541     uint32_t    *dst_line, d;
2542     uint32_t    *mask_line, m;
2543     uint32_t pack_cmp;
2544     int dst_stride, mask_stride;
2545
2546     __m128i xmm_src;
2547     __m128i xmm_dst;
2548     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549
2550     __m128i mmx_src, mmx_mask, mmx_dest;
2551
2552     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2553
2554     if (src == 0)
2555         return;
2556
2557     PIXMAN_IMAGE_GET_LINE (
2558         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2559     PIXMAN_IMAGE_GET_LINE (
2560         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2561
2562     xmm_src = _mm_unpacklo_epi8 (
2563         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2564     mmx_src   = xmm_src;
2565
2566     while (height--)
2567     {
2568         int w = width;
2569         const uint32_t *pm = (uint32_t *)mask_line;
2570         uint32_t *pd = (uint32_t *)dst_line;
2571
2572         dst_line += dst_stride;
2573         mask_line += mask_stride;
2574
2575         while (w && (unsigned long)pd & 15)
2576         {
2577             m = *pm++;
2578
2579             if (m)
2580             {
2581                 d = *pd;
2582
2583                 mmx_mask = unpack_32_1x128 (m);
2584                 mmx_dest = unpack_32_1x128 (d);
2585
2586                 *pd = pack_1x128_32 (
2587                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2588                                    mmx_dest));
2589             }
2590
2591             pd++;
2592             w--;
2593         }
2594
2595         while (w >= 4)
2596         {
2597             xmm_mask = load_128_unaligned ((__m128i*)pm);
2598
2599             pack_cmp =
2600                 _mm_movemask_epi8 (
2601                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2602
2603             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2604             if (pack_cmp != 0xffff)
2605             {
2606                 xmm_dst = load_128_aligned ((__m128i*)pd);
2607
2608                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2609
2610                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2611                                     &xmm_mask_lo, &xmm_mask_hi,
2612                                     &xmm_mask_lo, &xmm_mask_hi);
2613                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2614
2615                 save_128_aligned (
2616                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2617             }
2618
2619             pd += 4;
2620             pm += 4;
2621             w -= 4;
2622         }
2623
2624         while (w)
2625         {
2626             m = *pm++;
2627
2628             if (m)
2629             {
2630                 d = *pd;
2631
2632                 mmx_mask = unpack_32_1x128 (m);
2633                 mmx_dest = unpack_32_1x128 (d);
2634
2635                 *pd = pack_1x128_32 (
2636                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2637                                    mmx_dest));
2638             }
2639
2640             pd++;
2641             w--;
2642         }
2643     }
2644
2645 }
2646
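/* Composite a solid color OVER an a8r8g8b8 destination through an
 * a8r8g8b8 component-alpha mask:
 *     dest = src * mask + dest * (1 - mask * ALPHA (src))
 * Groups of four mask pixels that are all zero leave the destination
 * unchanged.
 */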
2647 static void
2648 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2649                                     pixman_op_t              op,
2650                                     pixman_image_t *         src_image,
2651                                     pixman_image_t *         mask_image,
2652                                     pixman_image_t *         dest_image,
2653                                     int32_t                  src_x,
2654                                     int32_t                  src_y,
2655                                     int32_t                  mask_x,
2656                                     int32_t                  mask_y,
2657                                     int32_t                  dest_x,
2658                                     int32_t                  dest_y,
2659                                     int32_t                  width,
2660                                     int32_t                  height)
2661 {
2662     uint32_t src;
2663     uint32_t    *dst_line, d;
2664     uint32_t    *mask_line, m;
2665     uint32_t pack_cmp;
2666     int dst_stride, mask_stride;
2667
2668     __m128i xmm_src, xmm_alpha;
2669     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2670     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2671
2672     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2673
2674     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2675
2676     if (src == 0)
2677         return;
2678
2679     PIXMAN_IMAGE_GET_LINE (
2680         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2681     PIXMAN_IMAGE_GET_LINE (
2682         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2683
2684     xmm_src = _mm_unpacklo_epi8 (
2685         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2686     xmm_alpha = expand_alpha_1x128 (xmm_src);
2687     mmx_src   = xmm_src;
2688     mmx_alpha = xmm_alpha;
2689
2690     while (height--)
2691     {
2692         int w = width;
2693         const uint32_t *pm = (uint32_t *)mask_line;
2694         uint32_t *pd = (uint32_t *)dst_line;
2695
2696         dst_line += dst_stride;
2697         mask_line += mask_stride;
2698
2699         while (w && (unsigned long)pd & 15)
2700         {
2701             m = *pm++;
2702
2703             if (m)
2704             {
2705                 d = *pd;
2706                 mmx_mask = unpack_32_1x128 (m);
2707                 mmx_dest = unpack_32_1x128 (d);
2708
2709                 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2710                                                   &mmx_alpha,
2711                                                   &mmx_mask,
2712                                                   &mmx_dest));
2713             }
2714
2715             pd++;
2716             w--;
2717         }
2718
2719         while (w >= 4)
2720         {
2721             xmm_mask = load_128_unaligned ((__m128i*)pm);
2722
2723             pack_cmp =
2724                 _mm_movemask_epi8 (
2725                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2726
2727             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2728             if (pack_cmp != 0xffff)
2729             {
2730                 xmm_dst = load_128_aligned ((__m128i*)pd);
2731
2732                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2733                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2734
2735                 in_over_2x128 (&xmm_src, &xmm_src,
2736                                &xmm_alpha, &xmm_alpha,
2737                                &xmm_mask_lo, &xmm_mask_hi,
2738                                &xmm_dst_lo, &xmm_dst_hi);
2739
2740                 save_128_aligned (
2741                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2742             }
2743
2744             pd += 4;
2745             pm += 4;
2746             w -= 4;
2747         }
2748
2749         while (w)
2750         {
2751             m = *pm++;
2752
2753             if (m)
2754             {
2755                 d = *pd;
2756                 mmx_mask = unpack_32_1x128 (m);
2757                 mmx_dest = unpack_32_1x128 (d);
2758
2759                 *pd = pack_1x128_32 (
2760                     in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2761             }
2762
2763             pd++;
2764             w--;
2765         }
2766     }
2767
2768 }
2769
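/* Composite an a8r8g8b8 source OVER an a8r8g8b8 destination through a
 * solid mask; only the alpha byte of the mask color is used.  Groups of
 * four source pixels that are all zero are skipped.
 */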
2770 static void
2771 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2772                                  pixman_op_t              op,
2773                                  pixman_image_t *         src_image,
2774                                  pixman_image_t *         mask_image,
2775                                  pixman_image_t *         dest_image,
2776                                  int32_t                  src_x,
2777                                  int32_t                  src_y,
2778                                  int32_t                  mask_x,
2779                                  int32_t                  mask_y,
2780                                  int32_t                  dest_x,
2781                                  int32_t                  dest_y,
2782                                  int32_t                  width,
2783                                  int32_t                  height)
2784 {
2785     uint32_t    *dst_line, *dst;
2786     uint32_t    *src_line, *src;
2787     uint32_t mask;
2788     int32_t w;
2789     int dst_stride, src_stride;
2790
2791     __m128i xmm_mask;
2792     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2793     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2794     __m128i xmm_alpha_lo, xmm_alpha_hi;
2795
2796     PIXMAN_IMAGE_GET_LINE (
2797         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2798     PIXMAN_IMAGE_GET_LINE (
2799         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2800
2801     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2802
2803     xmm_mask = create_mask_16_128 (mask >> 24);
2804
2805     while (height--)
2806     {
2807         dst = dst_line;
2808         dst_line += dst_stride;
2809         src = src_line;
2810         src_line += src_stride;
2811         w = width;
2812
2813         while (w && (unsigned long)dst & 15)
2814         {
2815             uint32_t s = *src++;
2816
2817             if (s)
2818             {
2819                 uint32_t d = *dst;
2820                 
2821                 __m128i ms = unpack_32_1x128 (s);
2822                 __m128i alpha    = expand_alpha_1x128 (ms);
2823                 __m128i dest     = xmm_mask;
2824                 __m128i alpha_dst = unpack_32_1x128 (d);
2825                 
2826                 *dst = pack_1x128_32 (
2827                     in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2828             }
2829             dst++;
2830             w--;
2831         }
2832
2833         while (w >= 4)
2834         {
2835             xmm_src = load_128_unaligned ((__m128i*)src);
2836
2837             if (!is_zero (xmm_src))
2838             {
2839                 xmm_dst = load_128_aligned ((__m128i*)dst);
2840                 
2841                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2842                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2843                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2844                                     &xmm_alpha_lo, &xmm_alpha_hi);
2845                 
2846                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2847                                &xmm_alpha_lo, &xmm_alpha_hi,
2848                                &xmm_mask, &xmm_mask,
2849                                &xmm_dst_lo, &xmm_dst_hi);
2850                 
2851                 save_128_aligned (
2852                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2853             }
2854                 
2855             dst += 4;
2856             src += 4;
2857             w -= 4;
2858         }
2859
2860         while (w)
2861         {
2862             uint32_t s = *src++;
2863
2864             if (s)
2865             {
2866                 uint32_t d = *dst;
2867                 
2868                 __m128i ms = unpack_32_1x128 (s);
2869                 __m128i alpha = expand_alpha_1x128 (ms);
2870                 __m128i mask  = xmm_mask;
2871                 __m128i dest  = unpack_32_1x128 (d);
2872                 
2873                 *dst = pack_1x128_32 (
2874                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2875             }
2876
2877             dst++;
2878             w--;
2879         }
2880     }
2881
2882 }
2883
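/* SRC copy from x8r8g8b8 to a8r8g8b8: every pixel is copied with its
 * alpha byte forced to 0xff.  The unrolled loop moves 16 pixels (four
 * XMM registers) per iteration.
 */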
2884 static void
2885 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2886                               pixman_op_t              op,
2887                               pixman_image_t *         src_image,
2888                               pixman_image_t *         mask_image,
2889                               pixman_image_t *         dest_image,
2890                               int32_t                  src_x,
2891                               int32_t                  src_y,
2892                               int32_t                  mask_x,
2893                               int32_t                  mask_y,
2894                               int32_t                  dest_x,
2895                               int32_t                  dest_y,
2896                               int32_t                  width,
2897                               int32_t                  height)
2898 {
2899     uint32_t    *dst_line, *dst;
2900     uint32_t    *src_line, *src;
2901     int32_t w;
2902     int dst_stride, src_stride;
2903
2904
2905     PIXMAN_IMAGE_GET_LINE (
2906         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2907     PIXMAN_IMAGE_GET_LINE (
2908         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2909
2910     while (height--)
2911     {
2912         dst = dst_line;
2913         dst_line += dst_stride;
2914         src = src_line;
2915         src_line += src_stride;
2916         w = width;
2917
2918         while (w && (unsigned long)dst & 15)
2919         {
2920             *dst++ = *src++ | 0xff000000;
2921             w--;
2922         }
2923
2924         while (w >= 16)
2925         {
2926             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2927             
2928             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2929             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2930             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2931             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2932             
2933             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2934             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2935             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2936             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2937             
2938             dst += 16;
2939             src += 16;
2940             w -= 16;
2941         }
2942
2943         while (w)
2944         {
2945             *dst++ = *src++ | 0xff000000;
2946             w--;
2947         }
2948     }
2949
2950 }
2951
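/* Composite an x8r8g8b8 source OVER an a8r8g8b8 destination through a
 * solid mask.  The source alpha is forced to 0xff, so the effective
 * per-pixel alpha is just the mask alpha.
 */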
2952 static void
2953 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2954                                  pixman_op_t              op,
2955                                  pixman_image_t *         src_image,
2956                                  pixman_image_t *         mask_image,
2957                                  pixman_image_t *         dest_image,
2958                                  int32_t                  src_x,
2959                                  int32_t                  src_y,
2960                                  int32_t                  mask_x,
2961                                  int32_t                  mask_y,
2962                                  int32_t                  dest_x,
2963                                  int32_t                  dest_y,
2964                                  int32_t                  width,
2965                                  int32_t                  height)
2966 {
2967     uint32_t    *dst_line, *dst;
2968     uint32_t    *src_line, *src;
2969     uint32_t mask;
2970     int dst_stride, src_stride;
2971     int32_t w;
2972
2973     __m128i xmm_mask, xmm_alpha;
2974     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2975     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2976
2977     PIXMAN_IMAGE_GET_LINE (
2978         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2979     PIXMAN_IMAGE_GET_LINE (
2980         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2981
2982     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2983
2984     xmm_mask = create_mask_16_128 (mask >> 24);
2985     xmm_alpha = mask_00ff;
2986
2987     while (height--)
2988     {
2989         dst = dst_line;
2990         dst_line += dst_stride;
2991         src = src_line;
2992         src_line += src_stride;
2993         w = width;
2994
2995         while (w && (unsigned long)dst & 15)
2996         {
2997             uint32_t s = (*src++) | 0xff000000;
2998             uint32_t d = *dst;
2999
3000             __m128i src   = unpack_32_1x128 (s);
3001             __m128i alpha = xmm_alpha;
3002             __m128i mask  = xmm_mask;
3003             __m128i dest  = unpack_32_1x128 (d);
3004
3005             *dst++ = pack_1x128_32 (
3006                 in_over_1x128 (&src, &alpha, &mask, &dest));
3007
3008             w--;
3009         }
3010
3011         while (w >= 4)
3012         {
3013             xmm_src = _mm_or_si128 (
3014                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3015             xmm_dst = load_128_aligned ((__m128i*)dst);
3016
3017             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3018             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3019
3020             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3021                            &xmm_alpha, &xmm_alpha,
3022                            &xmm_mask, &xmm_mask,
3023                            &xmm_dst_lo, &xmm_dst_hi);
3024
3025             save_128_aligned (
3026                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3027
3028             dst += 4;
3029             src += 4;
3030             w -= 4;
3031
3032         }
3033
3034         while (w)
3035         {
3036             uint32_t s = (*src++) | 0xff000000;
3037             uint32_t d = *dst;
3038
3039             __m128i src  = unpack_32_1x128 (s);
3040             __m128i alpha = xmm_alpha;
3041             __m128i mask  = xmm_mask;
3042             __m128i dest  = unpack_32_1x128 (d);
3043
3044             *dst++ = pack_1x128_32 (
3045                 in_over_1x128 (&src, &alpha, &mask, &dest));
3046
3047             w--;
3048         }
3049     }
3050
3051 }
3052
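/* Composite a8r8g8b8 OVER a8r8g8b8 by running the SSE2 OVER combiner
 * on each scanline.
 */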
3053 static void
3054 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3055                                pixman_op_t              op,
3056                                pixman_image_t *         src_image,
3057                                pixman_image_t *         mask_image,
3058                                pixman_image_t *         dest_image,
3059                                int32_t                  src_x,
3060                                int32_t                  src_y,
3061                                int32_t                  mask_x,
3062                                int32_t                  mask_y,
3063                                int32_t                  dest_x,
3064                                int32_t                  dest_y,
3065                                int32_t                  width,
3066                                int32_t                  height)
3067 {
3068     int dst_stride, src_stride;
3069     uint32_t    *dst_line, *dst;
3070     uint32_t    *src_line, *src;
3071
3072     PIXMAN_IMAGE_GET_LINE (
3073         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3074     PIXMAN_IMAGE_GET_LINE (
3075         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3076
3077     dst = dst_line;
3078     src = src_line;
3079
3080     while (height--)
3081     {
3082         sse2_combine_over_u (imp, op, dst, src, NULL, width);
3083
3084         dst += dst_stride;
3085         src += src_stride;
3086     }
3087 }
3088
3089 static force_inline uint16_t
3090 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3091 {
3092     __m128i ms;
3093
3094     ms = unpack_32_1x128 (src);
3095     return pack_565_32_16 (
3096         pack_1x128_32 (
3097             over_1x128 (
3098                 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3099 }
3100
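/* Composite an a8r8g8b8 source OVER an r5g6b5 destination.  The helper
 * above handles a single pixel; the main loop below converts eight 565
 * pixels per iteration.
 */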
3101 static void
3102 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3103                                pixman_op_t              op,
3104                                pixman_image_t *         src_image,
3105                                pixman_image_t *         mask_image,
3106                                pixman_image_t *         dest_image,
3107                                int32_t                  src_x,
3108                                int32_t                  src_y,
3109                                int32_t                  mask_x,
3110                                int32_t                  mask_y,
3111                                int32_t                  dest_x,
3112                                int32_t                  dest_y,
3113                                int32_t                  width,
3114                                int32_t                  height)
3115 {
3116     uint16_t    *dst_line, *dst, d;
3117     uint32_t    *src_line, *src, s;
3118     int dst_stride, src_stride;
3119     int32_t w;
3120
3121     __m128i xmm_alpha_lo, xmm_alpha_hi;
3122     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3123     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3124
3125     PIXMAN_IMAGE_GET_LINE (
3126         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3127     PIXMAN_IMAGE_GET_LINE (
3128         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3129
3130     while (height--)
3131     {
3132         dst = dst_line;
3133         src = src_line;
3134
3135         dst_line += dst_stride;
3136         src_line += src_stride;
3137         w = width;
3138
3139         /* Align dst on a 16-byte boundary */
3140         while (w &&
3141                ((unsigned long)dst & 15))
3142         {
3143             s = *src++;
3144             d = *dst;
3145
3146             *dst++ = composite_over_8888_0565pixel (s, d);
3147             w--;
3148         }
3149
3150         /* It's an 8-pixel loop */
3151         while (w >= 8)
3152         {
3153             /* I'm loading unaligned because the source address
3154              * is not guaranteed to be 16-byte aligned.
3155              */
3156             xmm_src = load_128_unaligned ((__m128i*) src);
3157             xmm_dst = load_128_aligned ((__m128i*) dst);
3158
3159             /* Unpacking */
3160             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3161             unpack_565_128_4x128 (xmm_dst,
3162                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3163             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3164                                 &xmm_alpha_lo, &xmm_alpha_hi);
3165
3166             /* I'm loading the next 4 pixels from memory
3167              * ahead of time to optimize the memory read.
3168              */
3169             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3170
3171             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3172                         &xmm_alpha_lo, &xmm_alpha_hi,
3173                         &xmm_dst0, &xmm_dst1);
3174
3175             /* Unpacking */
3176             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3177             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3178                                 &xmm_alpha_lo, &xmm_alpha_hi);
3179
3180             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3181                         &xmm_alpha_lo, &xmm_alpha_hi,
3182                         &xmm_dst2, &xmm_dst3);
3183
3184             save_128_aligned (
3185                 (__m128i*)dst, pack_565_4x128_128 (
3186                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3187
3188             w -= 8;
3189             dst += 8;
3190             src += 8;
3191         }
3192
3193         while (w--)
3194         {
3195             s = *src++;
3196             d = *dst;
3197
3198             *dst++ = composite_over_8888_0565pixel (s, d);
3199         }
3200     }
3201
3202 }
3203
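/* Composite a solid color OVER an a8r8g8b8 destination through an a8
 * mask.  Four mask bytes are read at a time: zero skips the pixels, and
 * 0xffffffff with an opaque source stores the solid color directly.
 */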
3204 static void
3205 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3206                               pixman_op_t              op,
3207                               pixman_image_t *         src_image,
3208                               pixman_image_t *         mask_image,
3209                               pixman_image_t *         dest_image,
3210                               int32_t                  src_x,
3211                               int32_t                  src_y,
3212                               int32_t                  mask_x,
3213                               int32_t                  mask_y,
3214                               int32_t                  dest_x,
3215                               int32_t                  dest_y,
3216                               int32_t                  width,
3217                               int32_t                  height)
3218 {
3219     uint32_t src, srca;
3220     uint32_t *dst_line, *dst;
3221     uint8_t *mask_line, *mask;
3222     int dst_stride, mask_stride;
3223     int32_t w;
3224     uint32_t m, d;
3225
3226     __m128i xmm_src, xmm_alpha, xmm_def;
3227     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3228     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3229
3230     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3231
3232     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3233
3234     srca = src >> 24;
3235     if (src == 0)
3236         return;
3237
3238     PIXMAN_IMAGE_GET_LINE (
3239         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3240     PIXMAN_IMAGE_GET_LINE (
3241         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3242
3243     xmm_def = create_mask_2x32_128 (src, src);
3244     xmm_src = expand_pixel_32_1x128 (src);
3245     xmm_alpha = expand_alpha_1x128 (xmm_src);
3246     mmx_src   = xmm_src;
3247     mmx_alpha = xmm_alpha;
3248
3249     while (height--)
3250     {
3251         dst = dst_line;
3252         dst_line += dst_stride;
3253         mask = mask_line;
3254         mask_line += mask_stride;
3255         w = width;
3256
3257         while (w && (unsigned long)dst & 15)
3258         {
3259             uint8_t m = *mask++;
3260
3261             if (m)
3262             {
3263                 d = *dst;
3264                 mmx_mask = expand_pixel_8_1x128 (m);
3265                 mmx_dest = unpack_32_1x128 (d);
3266
3267                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3268                                                    &mmx_alpha,
3269                                                    &mmx_mask,
3270                                                    &mmx_dest));
3271             }
3272
3273             w--;
3274             dst++;
3275         }
3276
3277         while (w >= 4)
3278         {
3279             m = *((uint32_t*)mask);
3280
3281             if (srca == 0xff && m == 0xffffffff)
3282             {
3283                 save_128_aligned ((__m128i*)dst, xmm_def);
3284             }
3285             else if (m)
3286             {
3287                 xmm_dst = load_128_aligned ((__m128i*) dst);
3288                 xmm_mask = unpack_32_1x128 (m);
3289                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3290
3291                 /* Unpacking */
3292                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3293                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3294
3295                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3296                                         &xmm_mask_lo, &xmm_mask_hi);
3297
3298                 in_over_2x128 (&xmm_src, &xmm_src,
3299                                &xmm_alpha, &xmm_alpha,
3300                                &xmm_mask_lo, &xmm_mask_hi,
3301                                &xmm_dst_lo, &xmm_dst_hi);
3302
3303                 save_128_aligned (
3304                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3305             }
3306
3307             w -= 4;
3308             dst += 4;
3309             mask += 4;
3310         }
3311
3312         while (w)
3313         {
3314             uint8_t m = *mask++;
3315
3316             if (m)
3317             {
3318                 d = *dst;
3319                 mmx_mask = expand_pixel_8_1x128 (m);
3320                 mmx_dest = unpack_32_1x128 (d);
3321
3322                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3323                                                    &mmx_alpha,
3324                                                    &mmx_mask,
3325                                                    &mmx_dest));
3326             }
3327
3328             w--;
3329             dst++;
3330         }
3331     }
3332
3333 }
3334
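/* Solid fill for 8, 16 and 32 bpp images.  The fill value is replicated
 * to 32 bits, scalar stores align the pointer to a 16-byte boundary, and
 * the bulk of each scanline is then written 128, 64, 32 or 16 bytes at a
 * time.
 */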
3335 static pixman_bool_t
3336 pixman_fill_sse2 (uint32_t *bits,
3337                   int       stride,
3338                   int       bpp,
3339                   int       x,
3340                   int       y,
3341                   int       width,
3342                   int       height,
3343                   uint32_t  data)
3344 {
3345     uint32_t byte_width;
3346     uint8_t         *byte_line;
3347
3348     __m128i xmm_def;
3349
3350     if (bpp == 8)
3351     {
3352         uint8_t b;
3353         uint16_t w;
3354
3355         stride = stride * (int) sizeof (uint32_t) / 1;
3356         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3357         byte_width = width;
3358         stride *= 1;
3359
3360         b = data & 0xff;
3361         w = (b << 8) | b;
3362         data = (w << 16) | w;
3363     }
3364     else if (bpp == 16)
3365     {
3366         stride = stride * (int) sizeof (uint32_t) / 2;
3367         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3368         byte_width = 2 * width;
3369         stride *= 2;
3370
3371         data = (data & 0xffff) * 0x00010001;
3372     }
3373     else if (bpp == 32)
3374     {
3375         stride = stride * (int) sizeof (uint32_t) / 4;
3376         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3377         byte_width = 4 * width;
3378         stride *= 4;
3379     }
3380     else
3381     {
3382         return FALSE;
3383     }
3384
3385     xmm_def = create_mask_2x32_128 (data, data);
3386
3387     while (height--)
3388     {
3389         int w;
3390         uint8_t *d = byte_line;
3391         byte_line += stride;
3392         w = byte_width;
3393
3394         while (w >= 1 && ((unsigned long)d & 1))
3395         {
3396             *(uint8_t *)d = data;
3397             w -= 1;
3398             d += 1;
3399         }
3400
3401         while (w >= 2 && ((unsigned long)d & 3))
3402         {
3403             *(uint16_t *)d = data;
3404             w -= 2;
3405             d += 2;
3406         }
3407
3408         while (w >= 4 && ((unsigned long)d & 15))
3409         {
3410             *(uint32_t *)d = data;
3411
3412             w -= 4;
3413             d += 4;
3414         }
3415
3416         while (w >= 128)
3417         {
3418             save_128_aligned ((__m128i*)(d),     xmm_def);
3419             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3420             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3421             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3422             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3423             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3424             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3425             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3426
3427             d += 128;
3428             w -= 128;
3429         }
3430
3431         if (w >= 64)
3432         {
3433             save_128_aligned ((__m128i*)(d),     xmm_def);
3434             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3435             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3436             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3437
3438             d += 64;
3439             w -= 64;
3440         }
3441
3442         if (w >= 32)
3443         {
3444             save_128_aligned ((__m128i*)(d),     xmm_def);
3445             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3446
3447             d += 32;
3448             w -= 32;
3449         }
3450
3451         if (w >= 16)
3452         {
3453             save_128_aligned ((__m128i*)(d),     xmm_def);
3454
3455             d += 16;
3456             w -= 16;
3457         }
3458
3459         while (w >= 4)
3460         {
3461             *(uint32_t *)d = data;
3462
3463             w -= 4;
3464             d += 4;
3465         }
3466
3467         if (w >= 2)
3468         {
3469             *(uint16_t *)d = data;
3470             w -= 2;
3471             d += 2;
3472         }
3473
3474         if (w >= 1)
3475         {
3476             *(uint8_t *)d = data;
3477             w -= 1;
3478             d += 1;
3479         }
3480     }
3481
3482     return TRUE;
3483 }
3484
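/* SRC operator, solid source modulated by an a8 mask, 8888 destination:
 * dest = src * mask.  A zero source degenerates to a solid fill with 0;
 * in the vector loop, a fully opaque source under an all-0xff mask word
 * stores the precomputed pixel directly and an all-zero mask word
 * stores zero.
 */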
3485 static void
3486 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3487                              pixman_op_t              op,
3488                              pixman_image_t *         src_image,
3489                              pixman_image_t *         mask_image,
3490                              pixman_image_t *         dest_image,
3491                              int32_t                  src_x,
3492                              int32_t                  src_y,
3493                              int32_t                  mask_x,
3494                              int32_t                  mask_y,
3495                              int32_t                  dest_x,
3496                              int32_t                  dest_y,
3497                              int32_t                  width,
3498                              int32_t                  height)
3499 {
3500     uint32_t src, srca;
3501     uint32_t    *dst_line, *dst;
3502     uint8_t     *mask_line, *mask;
3503     int dst_stride, mask_stride;
3504     int32_t w;
3505     uint32_t m;
3506
3507     __m128i xmm_src, xmm_def;
3508     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3509
3510     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3511
3512     srca = src >> 24;
3513     if (src == 0)
3514     {
3515         pixman_fill_sse2 (dest_image->bits.bits, dest_image->bits.rowstride,
3516                           PIXMAN_FORMAT_BPP (dest_image->bits.format),
3517                           dest_x, dest_y, width, height, 0);
3518         return;
3519     }
3520
3521     PIXMAN_IMAGE_GET_LINE (
3522         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3523     PIXMAN_IMAGE_GET_LINE (
3524         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3525
3526     xmm_def = create_mask_2x32_128 (src, src);
3527     xmm_src = expand_pixel_32_1x128 (src);
3528
3529     while (height--)
3530     {
3531         dst = dst_line;
3532         dst_line += dst_stride;
3533         mask = mask_line;
3534         mask_line += mask_stride;
3535         w = width;
3536
3537         while (w && (unsigned long)dst & 15)
3538         {
3539             uint8_t m = *mask++;
3540
3541             if (m)
3542             {
3543                 *dst = pack_1x128_32 (
3544                     pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3545             }
3546             else
3547             {
3548                 *dst = 0;
3549             }
3550
3551             w--;
3552             dst++;
3553         }
3554
3555         while (w >= 4)
3556         {
3557             m = *((uint32_t*)mask);
3558
3559             if (srca == 0xff && m == 0xffffffff)
3560             {
3561                 save_128_aligned ((__m128i*)dst, xmm_def);
3562             }
3563             else if (m)
3564             {
3565                 xmm_mask = unpack_32_1x128 (m);
3566                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3567
3568                 /* Unpacking */
3569                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3570
3571                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3572                                         &xmm_mask_lo, &xmm_mask_hi);
3573
3574                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3575                                     &xmm_mask_lo, &xmm_mask_hi,
3576                                     &xmm_mask_lo, &xmm_mask_hi);
3577
3578                 save_128_aligned (
3579                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3580             }
3581             else
3582             {
3583                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3584             }
3585
3586             w -= 4;
3587             dst += 4;
3588             mask += 4;
3589         }
3590
3591         while (w)
3592         {
3593             uint8_t m = *mask++;
3594
3595             if (m)
3596             {
3597                 *dst = pack_1x128_32 (
3598                     pix_multiply_1x128 (
3599                         xmm_src, expand_pixel_8_1x128 (m)));
3600             }
3601             else
3602             {
3603                 *dst = 0;
3604             }
3605
3606             w--;
3607             dst++;
3608         }
3609     }
3610
3611 }
3612
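/* OVER operator, solid source with an a8 mask, r5g6b5 destination.
 * Eight destination pixels are loaded per iteration, widened from 565
 * to four vectors of 8-bit channels, blended with in_over under the
 * expanded mask, and repacked to 565 in a single aligned store.
 */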
3613 static void
3614 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3615                               pixman_op_t              op,
3616                               pixman_image_t *         src_image,
3617                               pixman_image_t *         mask_image,
3618                               pixman_image_t *         dest_image,
3619                               int32_t                  src_x,
3620                               int32_t                  src_y,
3621                               int32_t                  mask_x,
3622                               int32_t                  mask_y,
3623                               int32_t                  dest_x,
3624                               int32_t                  dest_y,
3625                               int32_t                  width,
3626                               int32_t                  height)
3627 {
3628     uint32_t src;
3629     uint16_t    *dst_line, *dst, d;
3630     uint8_t     *mask_line, *mask;
3631     int dst_stride, mask_stride;
3632     int32_t w;
3633     uint32_t m;
3634     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3635
3636     __m128i xmm_src, xmm_alpha;
3637     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3638     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3639
3640     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3641
3642     if (src == 0)
3643         return;
3644
3645     PIXMAN_IMAGE_GET_LINE (
3646         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3647     PIXMAN_IMAGE_GET_LINE (
3648         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3649
3650     xmm_src = expand_pixel_32_1x128 (src);
3651     xmm_alpha = expand_alpha_1x128 (xmm_src);
3652     mmx_src = xmm_src;
3653     mmx_alpha = xmm_alpha;
3654
3655     while (height--)
3656     {
3657         dst = dst_line;
3658         dst_line += dst_stride;
3659         mask = mask_line;
3660         mask_line += mask_stride;
3661         w = width;
3662
3663         while (w && (unsigned long)dst & 15)
3664         {
3665             m = *mask++;
3666
3667             if (m)
3668             {
3669                 d = *dst;
3670                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3671                 mmx_dest = expand565_16_1x128 (d);
3672
3673                 *dst = pack_565_32_16 (
3674                     pack_1x128_32 (
3675                         in_over_1x128 (
3676                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3677             }
3678
3679             w--;
3680             dst++;
3681         }
3682
3683         while (w >= 8)
3684         {
3685             xmm_dst = load_128_aligned ((__m128i*) dst);
3686             unpack_565_128_4x128 (xmm_dst,
3687                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3688
3689             m = *((uint32_t*)mask);
3690             mask += 4;
3691
3692             if (m)
3693             {
3694                 xmm_mask = unpack_32_1x128 (m);
3695                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3696
3697                 /* Unpacking */
3698                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3699
3700                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3701                                         &xmm_mask_lo, &xmm_mask_hi);
3702
3703                 in_over_2x128 (&xmm_src, &xmm_src,
3704                                &xmm_alpha, &xmm_alpha,
3705                                &xmm_mask_lo, &xmm_mask_hi,
3706                                &xmm_dst0, &xmm_dst1);
3707             }
3708
3709             m = *((uint32_t*)mask);
3710             mask += 4;
3711
3712             if (m)
3713             {
3714                 xmm_mask = unpack_32_1x128 (m);
3715                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3716
3717                 /* Unpacking */
3718                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3719
3720                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3721                                         &xmm_mask_lo, &xmm_mask_hi);
3722                 in_over_2x128 (&xmm_src, &xmm_src,
3723                                &xmm_alpha, &xmm_alpha,
3724                                &xmm_mask_lo, &xmm_mask_hi,
3725                                &xmm_dst2, &xmm_dst3);
3726             }
3727
3728             save_128_aligned (
3729                 (__m128i*)dst, pack_565_4x128_128 (
3730                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3731
3732             w -= 8;
3733             dst += 8;
3734         }
3735
3736         while (w)
3737         {
3738             m = *mask++;
3739
3740             if (m)
3741             {
3742                 d = *dst;
3743                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3744                 mmx_dest = expand565_16_1x128 (d);
3745
3746                 *dst = pack_565_32_16 (
3747                     pack_1x128_32 (
3748                         in_over_1x128 (
3749                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3750             }
3751
3752             w--;
3753             dst++;
3754         }
3755     }
3756
3757 }
3758
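/* OVER operator for a 'pixbuf' source (non-premultiplied, with red and
 * blue swapped relative to the destination) onto an r5g6b5 destination.
 * Each block of four source pixels is classified first: fully opaque
 * blocks only need their color channels reordered, fully transparent
 * blocks are skipped, and mixed blocks take the full
 * over_rev_non_pre path.
 */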
3759 static void
3760 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3761                                  pixman_op_t              op,
3762                                  pixman_image_t *         src_image,
3763                                  pixman_image_t *         mask_image,
3764                                  pixman_image_t *         dest_image,
3765                                  int32_t                  src_x,
3766                                  int32_t                  src_y,
3767                                  int32_t                  mask_x,
3768                                  int32_t                  mask_y,
3769                                  int32_t                  dest_x,
3770                                  int32_t                  dest_y,
3771                                  int32_t                  width,
3772                                  int32_t                  height)
3773 {
3774     uint16_t    *dst_line, *dst, d;
3775     uint32_t    *src_line, *src, s;
3776     int dst_stride, src_stride;
3777     int32_t w;
3778     uint32_t opaque, zero;
3779
3780     __m128i ms;
3781     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3782     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3783
3784     PIXMAN_IMAGE_GET_LINE (
3785         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3786     PIXMAN_IMAGE_GET_LINE (
3787         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3788
3789     while (height--)
3790     {
3791         dst = dst_line;
3792         dst_line += dst_stride;
3793         src = src_line;
3794         src_line += src_stride;
3795         w = width;
3796
3797         while (w && (unsigned long)dst & 15)
3798         {
3799             s = *src++;
3800             d = *dst;
3801
3802             ms = unpack_32_1x128 (s);
3803
3804             *dst++ = pack_565_32_16 (
3805                 pack_1x128_32 (
3806                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3807             w--;
3808         }
3809
3810         while (w >= 8)
3811         {
3812             /* First round */
3813             xmm_src = load_128_unaligned ((__m128i*)src);
3814             xmm_dst = load_128_aligned  ((__m128i*)dst);
3815
3816             opaque = is_opaque (xmm_src);
3817             zero = is_zero (xmm_src);
3818
3819             unpack_565_128_4x128 (xmm_dst,
3820                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3821             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3822
3823             /* preload next round */
3824             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3825
3826             if (opaque)
3827             {
3828                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3829                                      &xmm_dst0, &xmm_dst1);
3830             }
3831             else if (!zero)
3832             {
3833                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3834                                         &xmm_dst0, &xmm_dst1);
3835             }
3836
3837             /* Second round */
3838             opaque = is_opaque (xmm_src);
3839             zero = is_zero (xmm_src);
3840
3841             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3842
3843             if (opaque)
3844             {
3845                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3846                                      &xmm_dst2, &xmm_dst3);
3847             }
3848             else if (!zero)
3849             {
3850                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3851                                         &xmm_dst2, &xmm_dst3);
3852             }
3853
3854             save_128_aligned (
3855                 (__m128i*)dst, pack_565_4x128_128 (
3856                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3857
3858             w -= 8;
3859             src += 8;
3860             dst += 8;
3861         }
3862
3863         while (w)
3864         {
3865             s = *src++;
3866             d = *dst;
3867
3868             ms = unpack_32_1x128 (s);
3869
3870             *dst++ = pack_565_32_16 (
3871                 pack_1x128_32 (
3872                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3873             w--;
3874         }
3875     }
3876
3877 }
3878
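/* The same pixbuf OVER as above, but with an 8888 destination: opaque
 * source blocks are color-swapped and stored directly, transparent
 * blocks leave the destination untouched, and mixed blocks blend
 * through over_rev_non_pre_2x128.
 */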
3879 static void
3880 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3881                                  pixman_op_t              op,
3882                                  pixman_image_t *         src_image,
3883                                  pixman_image_t *         mask_image,
3884                                  pixman_image_t *         dest_image,
3885                                  int32_t                  src_x,
3886                                  int32_t                  src_y,
3887                                  int32_t                  mask_x,
3888                                  int32_t                  mask_y,
3889                                  int32_t                  dest_x,
3890                                  int32_t                  dest_y,
3891                                  int32_t                  width,
3892                                  int32_t                  height)
3893 {
3894     uint32_t    *dst_line, *dst, d;
3895     uint32_t    *src_line, *src, s;
3896     int dst_stride, src_stride;
3897     int32_t w;
3898     uint32_t opaque, zero;
3899
3900     __m128i xmm_src_lo, xmm_src_hi;
3901     __m128i xmm_dst_lo, xmm_dst_hi;
3902
3903     PIXMAN_IMAGE_GET_LINE (
3904         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3905     PIXMAN_IMAGE_GET_LINE (
3906         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3907
3908     while (height--)
3909     {
3910         dst = dst_line;
3911         dst_line += dst_stride;
3912         src = src_line;
3913         src_line += src_stride;
3914         w = width;
3915
3916         while (w && (unsigned long)dst & 15)
3917         {
3918             s = *src++;
3919             d = *dst;
3920
3921             *dst++ = pack_1x128_32 (
3922                 over_rev_non_pre_1x128 (
3923                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
3924
3925             w--;
3926         }
3927
3928         while (w >= 4)
3929         {
3930             xmm_src_hi = load_128_unaligned ((__m128i*)src);
3931
3932             opaque = is_opaque (xmm_src_hi);
3933             zero = is_zero (xmm_src_hi);
3934
3935             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3936
3937             if (opaque)
3938             {
3939                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3940                                      &xmm_dst_lo, &xmm_dst_hi);
3941
3942                 save_128_aligned (
3943                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3944             }
3945             else if (!zero)
3946             {
3947                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
3948
3949                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3950
3951                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3952                                         &xmm_dst_lo, &xmm_dst_hi);
3953
3954                 save_128_aligned (
3955                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3956             }
3957
3958             w -= 4;
3959             dst += 4;
3960             src += 4;
3961         }
3962
3963         while (w)
3964         {
3965             s = *src++;
3966             d = *dst;
3967
3968             *dst++ = pack_1x128_32 (
3969                 over_rev_non_pre_1x128 (
3970                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
3971
3972             w--;
3973         }
3974     }
3975
3976 }
3977
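/* OVER operator, solid source with a component-alpha 8888 mask,
 * r5g6b5 destination.  The mask carries a separate alpha per color
 * channel, so blocks are skipped using a packed compare against zero:
 * pack_cmp == 0xffff means all four mask pixels are zero.
 */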
3978 static void
3979 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3980                                     pixman_op_t              op,
3981                                     pixman_image_t *         src_image,
3982                                     pixman_image_t *         mask_image,
3983                                     pixman_image_t *         dest_image,
3984                                     int32_t                  src_x,
3985                                     int32_t                  src_y,
3986                                     int32_t                  mask_x,
3987                                     int32_t                  mask_y,
3988                                     int32_t                  dest_x,
3989                                     int32_t                  dest_y,
3990                                     int32_t                  width,
3991                                     int32_t                  height)
3992 {
3993     uint32_t src;
3994     uint16_t    *dst_line, *dst, d;
3995     uint32_t    *mask_line, *mask, m;
3996     int dst_stride, mask_stride;
3997     int w;
3998     uint32_t pack_cmp;
3999
4000     __m128i xmm_src, xmm_alpha;
4001     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4002     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4003
4004     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4005
4006     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4007
4008     if (src == 0)
4009         return;
4010
4011     PIXMAN_IMAGE_GET_LINE (
4012         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4013     PIXMAN_IMAGE_GET_LINE (
4014         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4015
4016     xmm_src = expand_pixel_32_1x128 (src);
4017     xmm_alpha = expand_alpha_1x128 (xmm_src);
4018     mmx_src = xmm_src;
4019     mmx_alpha = xmm_alpha;
4020
4021     while (height--)
4022     {
4023         w = width;
4024         mask = mask_line;
4025         dst = dst_line;
4026         mask_line += mask_stride;
4027         dst_line += dst_stride;
4028
4029         while (w && ((unsigned long)dst & 15))
4030         {
4031             m = *(uint32_t *) mask;
4032
4033             if (m)
4034             {
4035                 d = *dst;
4036                 mmx_mask = unpack_32_1x128 (m);
4037                 mmx_dest = expand565_16_1x128 (d);
4038
4039                 *dst = pack_565_32_16 (
4040                     pack_1x128_32 (
4041                         in_over_1x128 (
4042                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4043             }
4044
4045             w--;
4046             dst++;
4047             mask++;
4048         }
4049
4050         while (w >= 8)
4051         {
4052             /* First round */
4053             xmm_mask = load_128_unaligned ((__m128i*)mask);
4054             xmm_dst = load_128_aligned ((__m128i*)dst);
4055
4056             pack_cmp = _mm_movemask_epi8 (
4057                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4058
4059             unpack_565_128_4x128 (xmm_dst,
4060                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4061             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4062
4063             /* preload next round */
4064             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4065
4066             /* first round: skip when all four mask pixels are zero */
4067             if (pack_cmp != 0xffff)
4068             {
4069                 in_over_2x128 (&xmm_src, &xmm_src,
4070                                &xmm_alpha, &xmm_alpha,
4071                                &xmm_mask_lo, &xmm_mask_hi,
4072                                &xmm_dst0, &xmm_dst1);
4073             }
4074
4075             /* Second round */
4076             pack_cmp = _mm_movemask_epi8 (
4077                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4078
4079             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4080
4081             if (pack_cmp != 0xffff)
4082             {
4083                 in_over_2x128 (&xmm_src, &xmm_src,
4084                                &xmm_alpha, &xmm_alpha,
4085                                &xmm_mask_lo, &xmm_mask_hi,
4086                                &xmm_dst2, &xmm_dst3);
4087             }
4088
4089             save_128_aligned (
4090                 (__m128i*)dst, pack_565_4x128_128 (
4091                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4092
4093             w -= 8;
4094             dst += 8;
4095             mask += 8;
4096         }
4097
4098         while (w)
4099         {
4100             m = *(uint32_t *) mask;
4101
4102             if (m)
4103             {
4104                 d = *dst;
4105                 mmx_mask = unpack_32_1x128 (m);
4106                 mmx_dest = expand565_16_1x128 (d);
4107
4108                 *dst = pack_565_32_16 (
4109                     pack_1x128_32 (
4110                         in_over_1x128 (
4111                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4112             }
4113
4114             w--;
4115             dst++;
4116             mask++;
4117         }
4118     }
4119
4120 }
4121
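/* IN operator, solid source with an a8 mask, a8 destination:
 * dest = srca * mask * dest, processed sixteen 8-bit pixels per
 * vector iteration.
 */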
4122 static void
4123 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4124                          pixman_op_t              op,
4125                          pixman_image_t *         src_image,
4126                          pixman_image_t *         mask_image,
4127                          pixman_image_t *         dest_image,
4128                          int32_t                  src_x,
4129                          int32_t                  src_y,
4130                          int32_t                  mask_x,
4131                          int32_t                  mask_y,
4132                          int32_t                  dest_x,
4133                          int32_t                  dest_y,
4134                          int32_t                  width,
4135                          int32_t                  height)
4136 {
4137     uint8_t     *dst_line, *dst;
4138     uint8_t     *mask_line, *mask;
4139     int dst_stride, mask_stride;
4140     uint32_t d, m;
4141     uint32_t src;
4142     int32_t w;
4143
4144     __m128i xmm_alpha;
4145     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4146     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4147
4148     PIXMAN_IMAGE_GET_LINE (
4149         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4150     PIXMAN_IMAGE_GET_LINE (
4151         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4152
4153     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4154
4155     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4156
4157     while (height--)
4158     {
4159         dst = dst_line;
4160         dst_line += dst_stride;
4161         mask = mask_line;
4162         mask_line += mask_stride;
4163         w = width;
4164
4165         while (w && ((unsigned long)dst & 15))
4166         {
4167             m = (uint32_t) *mask++;
4168             d = (uint32_t) *dst;
4169
4170             *dst++ = (uint8_t) pack_1x128_32 (
4171                 pix_multiply_1x128 (
4172                     pix_multiply_1x128 (xmm_alpha,
4173                                        unpack_32_1x128 (m)),
4174                     unpack_32_1x128 (d)));
4175             w--;
4176         }
4177
4178         while (w >= 16)
4179         {
4180             xmm_mask = load_128_unaligned ((__m128i*)mask);
4181             xmm_dst = load_128_aligned ((__m128i*)dst);
4182
4183             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4184             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4185
4186             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4187                                 &xmm_mask_lo, &xmm_mask_hi,
4188                                 &xmm_mask_lo, &xmm_mask_hi);
4189
4190             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4191                                 &xmm_dst_lo, &xmm_dst_hi,
4192                                 &xmm_dst_lo, &xmm_dst_hi);
4193
4194             save_128_aligned (
4195                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4196
4197             mask += 16;
4198             dst += 16;
4199             w -= 16;
4200         }
4201
4202         while (w)
4203         {
4204             m = (uint32_t) *mask++;
4205             d = (uint32_t) *dst;
4206
4207             *dst++ = (uint8_t) pack_1x128_32 (
4208                 pix_multiply_1x128 (
4209                     pix_multiply_1x128 (
4210                         xmm_alpha, unpack_32_1x128 (m)),
4211                     unpack_32_1x128 (d)));
4212             w--;
4213         }
4214     }
4215
4216 }
4217
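/* IN operator, solid source, a8 destination: dest = srca * dest.
 * An opaque source leaves the destination unchanged and returns early;
 * a transparent source degenerates to filling the rectangle with zero.
 */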
4218 static void
4219 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4220                        pixman_op_t              op,
4221                        pixman_image_t *         src_image,
4222                        pixman_image_t *         mask_image,
4223                        pixman_image_t *         dest_image,
4224                        int32_t                  src_x,
4225                        int32_t                  src_y,
4226                        int32_t                  mask_x,
4227                        int32_t                  mask_y,
4228                        int32_t                  dest_x,
4229                        int32_t                  dest_y,
4230                        int32_t                  width,
4231                        int32_t                  height)
4232 {
4233     uint8_t     *dst_line, *dst;
4234     int dst_stride;
4235     uint32_t d;
4236     uint32_t src;
4237     int32_t w;
4238
4239     __m128i xmm_alpha;
4240     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4241
4242     PIXMAN_IMAGE_GET_LINE (
4243         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4244
4245     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4246
4247     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4248
4249     src = src >> 24;
4250
4251     if (src == 0xff)
4252         return;
4253
4254     if (src == 0x00)
4255     {
4256         pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4257                      8, dest_x, dest_y, width, height, src);
4258
4259         return;
4260     }
4261
4262     while (height--)
4263     {
4264         dst = dst_line;
4265         dst_line += dst_stride;
4266         w = width;
4267
4268         while (w && ((unsigned long)dst & 15))
4269         {
4270             d = (uint32_t) *dst;
4271
4272             *dst++ = (uint8_t) pack_1x128_32 (
4273                 pix_multiply_1x128 (
4274                     xmm_alpha,
4275                     unpack_32_1x128 (d)));
4276             w--;
4277         }
4278
4279         while (w >= 16)
4280         {
4281             xmm_dst = load_128_aligned ((__m128i*)dst);
4282
4283             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4284
4285             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4286                                 &xmm_dst_lo, &xmm_dst_hi,
4287                                 &xmm_dst_lo, &xmm_dst_hi);
4288
4289             save_128_aligned (
4290                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4291
4292             dst += 16;
4293             w -= 16;
4294         }
4295
4296         while (w)
4297         {
4298             d = (uint32_t) *dst;
4299
4300             *dst++ = (uint8_t) pack_1x128_32 (
4301                 pix_multiply_1x128 (
4302                     xmm_alpha,
4303                     unpack_32_1x128 (d)));
4304             w--;
4305         }
4306     }
4307
4308 }
4309
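/* IN operator, a8 source and a8 destination: dest = src * dest. */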
4310 static void
4311 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4312                        pixman_op_t              op,
4313                        pixman_image_t *         src_image,
4314                        pixman_image_t *         mask_image,
4315                        pixman_image_t *         dest_image,
4316                        int32_t                  src_x,
4317                        int32_t                  src_y,
4318                        int32_t                  mask_x,
4319                        int32_t                  mask_y,
4320                        int32_t                  dest_x,
4321                        int32_t                  dest_y,
4322                        int32_t                  width,
4323                        int32_t                  height)
4324 {
4325     uint8_t     *dst_line, *dst;
4326     uint8_t     *src_line, *src;
4327     int src_stride, dst_stride;
4328     int32_t w;
4329     uint32_t s, d;
4330
4331     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4332     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4333
4334     PIXMAN_IMAGE_GET_LINE (
4335         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4336     PIXMAN_IMAGE_GET_LINE (
4337         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4338
4339     while (height--)
4340     {
4341         dst = dst_line;
4342         dst_line += dst_stride;
4343         src = src_line;
4344         src_line += src_stride;
4345         w = width;
4346
4347         while (w && ((unsigned long)dst & 15))
4348         {
4349             s = (uint32_t) *src++;
4350             d = (uint32_t) *dst;
4351
4352             *dst++ = (uint8_t) pack_1x128_32 (
4353                 pix_multiply_1x128 (
4354                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4355             w--;
4356         }
4357
4358         while (w >= 16)
4359         {
4360             xmm_src = load_128_unaligned ((__m128i*)src);
4361             xmm_dst = load_128_aligned ((__m128i*)dst);
4362
4363             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4364             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4365
4366             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4367                                 &xmm_dst_lo, &xmm_dst_hi,
4368                                 &xmm_dst_lo, &xmm_dst_hi);
4369
4370             save_128_aligned (
4371                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4372
4373             src += 16;
4374             dst += 16;
4375             w -= 16;
4376         }
4377
4378         while (w)
4379         {
4380             s = (uint32_t) *src++;
4381             d = (uint32_t) *dst;
4382
4383             *dst++ = (uint8_t) pack_1x128_32 (
4384                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4385             w--;
4386         }
4387     }
4388
4389 }
4390
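/* ADD operator, solid source with an a8 mask, a8 destination:
 * dest = clamp (dest + srca * mask), using saturating adds on the
 * unpacked 16-bit channels.
 */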
4391 static void
4392 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4393                           pixman_op_t              op,
4394                           pixman_image_t *         src_image,
4395                           pixman_image_t *         mask_image,
4396                           pixman_image_t *         dest_image,
4397                           int32_t                  src_x,
4398                           int32_t                  src_y,
4399                           int32_t                  mask_x,
4400                           int32_t                  mask_y,
4401                           int32_t                  dest_x,
4402                           int32_t                  dest_y,
4403                           int32_t                  width,
4404                           int32_t                  height)
4405 {
4406     uint8_t     *dst_line, *dst;
4407     uint8_t     *mask_line, *mask;
4408     int dst_stride, mask_stride;
4409     int32_t w;
4410     uint32_t src;
4411     uint32_t m, d;
4412
4413     __m128i xmm_alpha;
4414     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4415     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4416
4417     PIXMAN_IMAGE_GET_LINE (
4418         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4419     PIXMAN_IMAGE_GET_LINE (
4420         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4421
4422     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4423
4424     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4425
4426     while (height--)
4427     {
4428         dst = dst_line;
4429         dst_line += dst_stride;
4430         mask = mask_line;
4431         mask_line += mask_stride;
4432         w = width;
4433
4434         while (w && ((unsigned long)dst & 15))
4435         {
4436             m = (uint32_t) *mask++;
4437             d = (uint32_t) *dst;
4438
4439             *dst++ = (uint8_t) pack_1x128_32 (
4440                 _mm_adds_epu16 (
4441                     pix_multiply_1x128 (
4442                         xmm_alpha, unpack_32_1x128 (m)),
4443                     unpack_32_1x128 (d)));
4444             w--;
4445         }
4446
4447         while (w >= 16)
4448         {
4449             xmm_mask = load_128_unaligned ((__m128i*)mask);
4450             xmm_dst = load_128_aligned ((__m128i*)dst);
4451
4452             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4453             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4454
4455             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4456                                 &xmm_mask_lo, &xmm_mask_hi,
4457                                 &xmm_mask_lo, &xmm_mask_hi);
4458
4459             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4460             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4461
4462             save_128_aligned (
4463                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4464
4465             mask += 16;
4466             dst += 16;
4467             w -= 16;
4468         }
4469
4470         while (w)
4471         {
4472             m = (uint32_t) *mask++;
4473             d = (uint32_t) *dst;
4474
4475             *dst++ = (uint8_t) pack_1x128_32 (
4476                 _mm_adds_epu16 (
4477                     pix_multiply_1x128 (
4478                         xmm_alpha, unpack_32_1x128 (m)),
4479                     unpack_32_1x128 (d)));
4480
4481             w--;
4482         }
4483     }
4484
4485 }
4486
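/* ADD operator, solid source, a8 destination:
 * dest = clamp (dest + srca).  srca == 0 is a no-op and srca == 0xff
 * saturates every pixel to 0xff, so both cases are handled before the
 * per-pixel loop.
 */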
4487 static void
4488 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4489                         pixman_op_t              op,
4490                         pixman_image_t *         src_image,
4491                         pixman_image_t *         mask_image,
4492                         pixman_image_t *         dest_image,
4493                         int32_t                  src_x,
4494                         int32_t                  src_y,
4495                         int32_t                  mask_x,
4496                         int32_t                  mask_y,
4497                         int32_t                  dest_x,
4498                         int32_t                  dest_y,
4499                         int32_t                  width,
4500                         int32_t                  height)
4501 {
4502     uint8_t     *dst_line, *dst;
4503     int dst_stride;
4504     int32_t w;
4505     uint32_t src;
4506
4507     __m128i xmm_src;
4508
4509     PIXMAN_IMAGE_GET_LINE (
4510         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4511
4512     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4513
4514     src >>= 24;
4515
4516     if (src == 0x00)
4517         return;
4518
4519     if (src == 0xff)
4520     {
4521         pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4522                      8, dest_x, dest_y, width, height, 0xff);
4523
4524         return;
4525     }
4526
4527     src = (src << 24) | (src << 16) | (src << 8) | src;
4528     xmm_src = _mm_set_epi32 (src, src, src, src);
4529
4530     while (height--)
4531     {
4532         dst = dst_line;
4533         dst_line += dst_stride;
4534         w = width;
4535
4536         while (w && ((unsigned long)dst & 15))
4537         {
4538             *dst = (uint8_t)_mm_cvtsi128_si32 (
4539                 _mm_adds_epu8 (
4540                     xmm_src,
4541                     _mm_cvtsi32_si128 (*dst)));
4542
4543             w--;
4544             dst++;
4545         }
4546
4547         while (w >= 16)
4548         {
4549             save_128_aligned (
4550                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4551
4552             dst += 16;
4553             w -= 16;
4554         }
4555
4556         while (w)
4557         {
4558             *dst = (uint8_t)_mm_cvtsi128_si32 (
4559                 _mm_adds_epu8 (
4560                     xmm_src,
4561                     _mm_cvtsi32_si128 (*dst)));
4562
4563             w--;
4564             dst++;
4565         }
4566     }
4567
4568 }
4569
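/* ADD operator, a8 source and a8 destination.  The unaligned head and
 * tail are added with a scalar saturation trick (see the comment in
 * the head loop); the four-byte-aligned middle is handed to
 * sse2_combine_add_u, which treats the bytes as packed 32-bit words.
 */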
4570 static void
4571 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4572                         pixman_op_t              op,
4573                         pixman_image_t *         src_image,
4574                         pixman_image_t *         mask_image,
4575                         pixman_image_t *         dest_image,
4576                         int32_t                  src_x,
4577                         int32_t                  src_y,
4578                         int32_t                  mask_x,
4579                         int32_t                  mask_y,
4580                         int32_t                  dest_x,
4581                         int32_t                  dest_y,
4582                         int32_t                  width,
4583                         int32_t                  height)
4584 {
4585     uint8_t     *dst_line, *dst;
4586     uint8_t     *src_line, *src;
4587     int dst_stride, src_stride;
4588     int32_t w;
4589     uint16_t t;
4590
4591     PIXMAN_IMAGE_GET_LINE (
4592         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4593     PIXMAN_IMAGE_GET_LINE (
4594         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4595
4596     while (height--)
4597     {
4598         dst = dst_line;
4599         src = src_line;
4600
4601         dst_line += dst_stride;
4602         src_line += src_stride;
4603         w = width;
4604
4605         /* Small head */
4606         while (w && (unsigned long)dst & 3)
4607         {
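            /* t <= 0x1fe here; if the sum overflowed past 0xff then
             * (t >> 8) is 1, 0 - 1 is 0xffff, and the OR saturates
             * the stored byte to 0xff. */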
4608             t = (*dst) + (*src++);
4609             *dst++ = t | (0 - (t >> 8));
4610             w--;
4611         }
4612
4613         sse2_combine_add_u (imp, op,
4614                             (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4615
4616         /* Small tail */
4617         dst += w & 0xfffc;
4618         src += w & 0xfffc;
4619
4620         w &= 3;
4621
4622         while (w)
4623         {
4624             t = (*dst) + (*src++);
4625             *dst++ = t | (0 - (t >> 8));
4626             w--;
4627         }
4628     }
4629
4630 }
4631
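/* ADD operator, 8888 source and destination: each scanline is passed
 * straight through to sse2_combine_add_u.
 */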
4632 static void
4633 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4634                               pixman_op_t              op,
4635                               pixman_image_t *         src_image,
4636                               pixman_image_t *         mask_image,
4637                               pixman_image_t *         dest_image,
4638                               int32_t                  src_x,
4639                               int32_t                  src_y,
4640                               int32_t                  mask_x,
4641                               int32_t                  mask_y,
4642                               int32_t                  dest_x,
4643                               int32_t                  dest_y,
4644                               int32_t                  width,
4645                               int32_t                  height)
4646 {
4647     uint32_t    *dst_line, *dst;
4648     uint32_t    *src_line, *src;
4649     int dst_stride, src_stride;
4650
4651     PIXMAN_IMAGE_GET_LINE (
4652         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4653     PIXMAN_IMAGE_GET_LINE (
4654         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4655
4656     while (height--)
4657     {
4658         dst = dst_line;
4659         dst_line += dst_stride;
4660         src = src_line;
4661         src_line += src_stride;
4662
4663         sse2_combine_add_u (imp, op, dst, src, NULL, width);
4664     }
4665
4666 }
4667
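/* Rectangle copy between images of equal 16 or 32 bpp (other depths
 * return FALSE).  The destination is aligned with 2- and 4-byte
 * copies, then data moves in 64- and 16-byte blocks using unaligned
 * loads and aligned stores, with a scalar tail.
 */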
4668 static pixman_bool_t
4669 pixman_blt_sse2 (uint32_t *src_bits,
4670                  uint32_t *dst_bits,
4671                  int       src_stride,
4672                  int       dst_stride,
4673                  int       src_bpp,
4674                  int       dst_bpp,
4675                  int       src_x,
4676                  int       src_y,
4677                  int       dest_x,
4678                  int       dest_y,
4679                  int       width,
4680                  int       height)
4681 {
4682     uint8_t *   src_bytes;
4683     uint8_t *   dst_bytes;
4684     int byte_width;
4685
4686     if (src_bpp != dst_bpp)
4687         return FALSE;
4688
4689     if (src_bpp == 16)
4690     {
4691         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4692         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4693         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4694         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4695         byte_width = 2 * width;
4696         src_stride *= 2;
4697         dst_stride *= 2;
4698     }
4699     else if (src_bpp == 32)
4700     {
4701         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4702         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4703         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4704         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4705         byte_width = 4 * width;
4706         src_stride *= 4;
4707         dst_stride *= 4;
4708     }
4709     else
4710     {
4711         return FALSE;
4712     }
4713
4714     while (height--)
4715     {
4716         int w;
4717         uint8_t *s = src_bytes;
4718         uint8_t *d = dst_bytes;
4719         src_bytes += src_stride;
4720         dst_bytes += dst_stride;
4721         w = byte_width;
4722
4723         while (w >= 2 && ((unsigned long)d & 3))
4724         {
4725             *(uint16_t *)d = *(uint16_t *)s;
4726             w -= 2;
4727             s += 2;
4728             d += 2;
4729         }
4730
4731         while (w >= 4 && ((unsigned long)d & 15))
4732         {
4733             *(uint32_t *)d = *(uint32_t *)s;
4734
4735             w -= 4;
4736             s += 4;
4737             d += 4;
4738         }
4739
4740         while (w >= 64)
4741         {
4742             __m128i xmm0, xmm1, xmm2, xmm3;
4743
4744             xmm0 = load_128_unaligned ((__m128i*)(s));
4745             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4746             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4747             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4748
4749             save_128_aligned ((__m128i*)(d),    xmm0);
4750             save_128_aligned ((__m128i*)(d + 16), xmm1);
4751             save_128_aligned ((__m128i*)(d + 32), xmm2);
4752             save_128_aligned ((__m128i*)(d + 48), xmm3);
4753
4754             s += 64;
4755             d += 64;
4756             w -= 64;
4757         }
4758
4759         while (w >= 16)
4760         {
4761             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4762
4763             w -= 16;
4764             d += 16;
4765             s += 16;
4766         }
4767
4768         while (w >= 4)
4769         {
4770             *(uint32_t *)d = *(uint32_t *)s;
4771
4772             w -= 4;
4773             s += 4;
4774             d += 4;
4775         }
4776
4777         if (w >= 2)
4778         {
4779             *(uint16_t *)d = *(uint16_t *)s;
4780             w -= 2;
4781             s += 2;
4782             d += 2;
4783         }
4784     }
4785
4787     return TRUE;
4788 }
4789
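/* SRC copy of a rectangle between images of equal depth; a thin
 * wrapper around pixman_blt_sse2.
 */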
4790 static void
4791 sse2_composite_copy_area (pixman_implementation_t *imp,
4792                           pixman_op_t              op,
4793                           pixman_image_t *         src_image,
4794                           pixman_image_t *         mask_image,
4795                           pixman_image_t *         dest_image,
4796                           int32_t                  src_x,
4797                           int32_t                  src_y,
4798                           int32_t                  mask_x,
4799                           int32_t                  mask_y,
4800                           int32_t                  dest_x,
4801                           int32_t                  dest_y,
4802                           int32_t                  width,
4803                           int32_t                  height)
4804 {
4805     pixman_blt_sse2 (src_image->bits.bits,
4806                      dest_image->bits.bits,
4807                      src_image->bits.rowstride,
4808                      dest_image->bits.rowstride,
4809                      PIXMAN_FORMAT_BPP (src_image->bits.format),
4810                      PIXMAN_FORMAT_BPP (dest_image->bits.format),
4811                      src_x, src_y, dest_x, dest_y, width, height);
4812 }
4813
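/* OVER operator, x8r8g8b8 source with an a8 mask, 8888 destination.
 * The unused source alpha byte is forced to 0xff, so the blend is
 * in_over with a constant 0x00ff source alpha; blocks under a solid
 * 0xff mask store the source directly.
 */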
4814 static void
4815 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4816                                  pixman_op_t              op,
4817                                  pixman_image_t *         src_image,
4818                                  pixman_image_t *         mask_image,
4819                                  pixman_image_t *         dest_image,
4820                                  int32_t                  src_x,
4821                                  int32_t                  src_y,
4822                                  int32_t                  mask_x,
4823                                  int32_t                  mask_y,
4824                                  int32_t                  dest_x,
4825                                  int32_t                  dest_y,
4826                                  int32_t                  width,
4827                                  int32_t                  height)
4828 {
4829     uint32_t    *src, *src_line, s;
4830     uint32_t    *dst, *dst_line, d;
4831     uint8_t         *mask, *mask_line;
4832     uint32_t m;
4833     int src_stride, mask_stride, dst_stride;
4834     int32_t w;
4835     __m128i ms;
4836
4837     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4838     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4839     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4840
4841     PIXMAN_IMAGE_GET_LINE (
4842         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4843     PIXMAN_IMAGE_GET_LINE (
4844         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4845     PIXMAN_IMAGE_GET_LINE (
4846         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4847
4848     while (height--)
4849     {
4850         src = src_line;
4851         src_line += src_stride;
4852         dst = dst_line;
4853         dst_line += dst_stride;
4854         mask = mask_line;
4855         mask_line += mask_stride;
4856
4857         w = width;
4858
4859         while (w && (unsigned long)dst & 15)
4860         {
4861             s = 0xff000000 | *src++;
4862             m = (uint32_t) *mask++;
4863             d = *dst;
4864             ms = unpack_32_1x128 (s);
4865
4866             if (m != 0xff)
4867             {
4868                 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4869                 __m128i md = unpack_32_1x128 (d);
4870
4871                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4872             }
4873
4874             *dst++ = pack_1x128_32 (ms);
4875             w--;
4876         }
4877
4878         while (w >= 4)
4879         {
4880             m = *(uint32_t*) mask;
4881             xmm_src = _mm_or_si128 (
4882                 load_128_unaligned ((__m128i*)src), mask_ff000000);
4883
4884             if (m == 0xffffffff)
4885             {
4886                 save_128_aligned ((__m128i*)dst, xmm_src);
4887             }
4888             else
4889             {
4890                 xmm_dst = load_128_aligned ((__m128i*)dst);
4891
4892                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4893
4894                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4895                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4896                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4897
4898                 expand_alpha_rev_2x128 (
4899                     xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4900
4901                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4902                                &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4903                                &xmm_dst_lo, &xmm_dst_hi);
4904
4905                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4906             }
4907
4908             src += 4;
4909             dst += 4;
4910             mask += 4;
4911             w -= 4;
4912         }
4913
4914         while (w)
4915         {
4916             m = (uint32_t) *mask++;
4917
4918             if (m)
4919             {
4920                 s = 0xff000000 | *src;
4921
4922                 if (m == 0xff)
4923                 {
4924                     *dst = s;
4925                 }
4926                 else
4927                 {
4928                     __m128i ma, md, ms;
4929
4930                     d = *dst;
4931
4932                     ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4933                     md = unpack_32_1x128 (d);
4934                     ms = unpack_32_1x128 (s);
4935
4936                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4937                 }
4938
4939             }
4940
4941             src++;
4942             dst++;
4943             w--;
4944         }
4945     }
4946
4947 }
4948
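/* OVER operator, a8r8g8b8 source with an a8 mask, 8888 destination:
 * dest = (src IN mask) OVER dest.  A block whose mask is solid 0xff
 * and whose source is fully opaque is stored directly; a zero-mask
 * block is skipped entirely.
 */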
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dest_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint8_t         *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (unsigned long)dst & 15)
        {
            uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }

        while (w >= 4)
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                xmm_src = load_128_unaligned ((__m128i*)src);

                if (m == 0xffffffff && is_opaque (xmm_src))
                {
                    save_128_aligned ((__m128i *)dst, xmm_src);
                }
                else
                {
                    xmm_dst = load_128_aligned ((__m128i *)dst);

                    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                                   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
                }
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }
    }

}

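/* OVER_REVERSE with a solid source: the destination is composited
 * over the constant source, so each pixel computes
 * over (dst, dst_alpha, src) instead of the usual over (src, ...).
 */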
static void
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dest_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, *dst;
    __m128i xmm_src;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_dsta_hi, xmm_dsta_lo;
    int dst_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            __m128i vd;

            vd = unpack_32_1x128 (*dst);

            *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                              xmm_src));
            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m128i tmp_lo, tmp_hi;

            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);

            tmp_lo = xmm_src;
            tmp_hi = xmm_src;

            over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                        &xmm_dsta_lo, &xmm_dsta_hi,
                        &tmp_lo, &tmp_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            __m128i vd;

            vd = unpack_32_1x128 (*dst);

            *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                              xmm_src));
            w--;
            dst++;
        }

    }

}

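/* Composite OVER with an a8r8g8b8 source and an a8r8g8b8 mask.  Only
 * the alpha byte of each mask pixel is used: a transparent mask block
 * is skipped entirely, and an opaque mask over an opaque source block
 * degenerates to a straight copy.
 */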
static void
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dest_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint32_t    *mask, *mask_line;
    uint32_t    m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (unsigned long)dst & 15)
        {
            uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);

            if (!is_transparent (xmm_mask))
            {
                xmm_src = load_128_unaligned ((__m128i*)src);

                if (is_opaque (xmm_mask) && is_opaque (xmm_src))
                {
                    save_128_aligned ((__m128i *)dst, xmm_src);
                }
                else
                {
                    xmm_dst = load_128_aligned ((__m128i *)dst);

                    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                                   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
                }
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }
    }

}

/* A variant of 'sse2_combine_over_u' with minor tweaks */
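/* The source position is tracked in 16.16 fixed point: vx >> 16
 * selects the nearest source pixel and vx advances by unit_x per
 * destination pixel; max_vx is unused here.
 */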
static force_inline void
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                             const uint32_t* ps,
                                             int32_t         w,
                                             pixman_fixed_t  vx,
                                             pixman_fixed_t  unit_x,
                                             pixman_fixed_t  max_vx,
                                             pixman_bool_t   fully_transparent_src)
{
    uint32_t s, d;
    const uint32_t* pm = NULL;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (fully_transparent_src)
        return;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps + (vx >> 16), pm);
        vx += unit_x;

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        if (pm)
            pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i tmp;
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = ps[vx >> 16];
        vx += unit_x;
        tmp2 = ps[vx >> 16];
        vx += unit_x;
        tmp3 = ps[vx >> 16];
        vx += unit_x;
        tmp4 = ps[vx >> 16];
        vx += unit_x;

        tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

        xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);

        if (is_opaque (xmm_src_hi))
        {
            save_128_aligned ((__m128i*)pd, xmm_src_hi);
        }
        else if (!is_zero (xmm_src_hi))
        {
            xmm_dst_hi = load_128_aligned ((__m128i*) pd);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

            expand_alpha_2x128 (
                xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned ((__m128i*)pd,
                              pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        w -= 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps + (vx >> 16), pm);
        vx += unit_x;

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        if (pm)
            pm++;

        w--;
    }
}

FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)

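/* Nearest-neighbour OVER with a solid mask: the single mask pixel's
 * alpha is replicated into xmm_mask once and then applied to every
 * fetched source pixel through in_over.
 */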
static force_inline void
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
                                               uint32_t *       dst,
                                               const uint32_t * src,
                                               int32_t          w,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
                                               pixman_fixed_t   max_vx,
                                               pixman_bool_t    zero_src)
{
    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (zero_src || (*mask >> 24) == 0)
        return;

    xmm_mask = create_mask_16_128 (*mask >> 24);

    while (w && (unsigned long)dst & 15)
    {
        uint32_t s = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        if (s)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (s);
            __m128i alpha = expand_alpha_1x128 (ms);
            __m128i mask  = xmm_mask;
            __m128i dest  = unpack_32_1x128 (d);

            *dst = pack_1x128_32 (
                in_over_1x128 (&ms, &alpha, &mask, &dest));
        }
        dst++;
        w--;
    }

    while (w >= 4)
    {
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp2 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp3 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp4 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

        if (!is_zero (xmm_src))
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                           &xmm_alpha_lo, &xmm_alpha_hi,
                           &xmm_mask, &xmm_mask,
                           &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        dst += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        if (s)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (s);
            __m128i alpha = expand_alpha_1x128 (ms);
            __m128i mask  = xmm_mask;
            __m128i dest  = unpack_32_1x128 (d);

            *dst = pack_1x128_32 (
                in_over_1x128 (&ms, &alpha, &mask, &dest));
        }

        dst++;
        w--;
    }

}

FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)

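/* Bilinear interpolation of one scanline.  Per channel this computes
 *
 *   out = ((tl * wt + bl * wb) * (256 - xf)
 *          + (tr * wt + br * wb) * xf) >> 16
 *
 * where xf is the top 8 bits of the x fraction and the vertical
 * weights satisfy wt + wb == 256, so the total weight is 65536 and
 * the final shift renormalizes to 8 bits per channel.
 */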
static void
bilinear_interpolate_line_sse2 (uint32_t *       out,
                                const uint32_t * top,
                                const uint32_t * bottom,
                                int              wt,
                                int              wb,
                                pixman_fixed_t   x,
                                pixman_fixed_t   ux,
                                int              width)
{
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
    const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
    const __m128i xmm_zero = _mm_setzero_si128 ();
    __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
    uint32_t pix1, pix2, pix3, pix4;

    #define INTERPOLATE_ONE_PIXEL(pix)                                          \
    do {                                                                        \
        __m128i xmm_wh, xmm_lo, xmm_hi, a;                                      \
        /* fetch 2x2 pixel block into sse2 register */                          \
        uint32_t tl = top [pixman_fixed_to_int (x)];                            \
        uint32_t tr = top [pixman_fixed_to_int (x) + 1];                        \
        uint32_t bl = bottom [pixman_fixed_to_int (x)];                         \
        uint32_t br = bottom [pixman_fixed_to_int (x) + 1];                     \
        a = _mm_set_epi32 (tr, tl, br, bl);                                     \
        x += ux;                                                                \
        /* vertical interpolation */                                            \
        a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),    \
                                            xmm_wt),                            \
                           _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),    \
                                            xmm_wb));                           \
        /* calculate horizontal weights */                                      \
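        /* xmm_x carries the x fraction in each 16-bit lane; the       \
         * xor/add pair yields (256 - frac) in the low four lanes      \
         * (left-pixel weight) and frac in the high four lanes         \
         * (right-pixel weight) */                                     \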
        xmm_wh = _mm_add_epi16 (xmm_addc,                                       \
                                _mm_xor_si128 (xmm_xorc,                        \
                                               _mm_srli_epi16 (xmm_x, 8)));     \
        xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                  \
        /* horizontal interpolation */                                          \
        xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                   \
        xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);                                   \
        a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),                 \
                           _mm_unpackhi_epi16 (xmm_lo, xmm_hi));                \
        /* shift and pack the result */                                         \
        a = _mm_srli_epi32 (a, 16);                                             \
        a = _mm_packs_epi32 (a, a);                                             \
        a = _mm_packus_epi16 (a, a);                                            \
        pix = _mm_cvtsi128_si32 (a);                                            \
    } while (0)

    while ((width -= 4) >= 0)
    {
        INTERPOLATE_ONE_PIXEL (pix1);
        INTERPOLATE_ONE_PIXEL (pix2);
        INTERPOLATE_ONE_PIXEL (pix3);
        INTERPOLATE_ONE_PIXEL (pix4);
        *out++ = pix1;
        *out++ = pix2;
        *out++ = pix3;
        *out++ = pix4;
    }
    if (width & 2)
    {
        INTERPOLATE_ONE_PIXEL (pix1);
        INTERPOLATE_ONE_PIXEL (pix2);
        *out++ = pix1;
        *out++ = pix2;
    }
    if (width & 1)
    {
        INTERPOLATE_ONE_PIXEL (pix1);
        *out = pix1;
    }

    #undef INTERPOLATE_ONE_PIXEL
}

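/* For SRC the operator and mask contribute nothing, so a scanline is
 * just the raw bilinear interpolation of the two source rows.
 */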
static force_inline void
scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
                                             const uint32_t * mask,
                                             const uint32_t * src_top,
                                             const uint32_t * src_bottom,
                                             int32_t          w,
                                             int              wt,
                                             int              wb,
                                             pixman_fixed_t   vx,
                                             pixman_fixed_t   unit_x,
                                             pixman_fixed_t   max_vx,
                                             pixman_bool_t    zero_src)
{
    bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
                                    wt, wb, vx, unit_x, w);
}

FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
                               scaled_bilinear_scanline_sse2_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FALSE, FALSE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
                               scaled_bilinear_scanline_sse2_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FALSE, FALSE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
                               scaled_bilinear_scanline_sse2_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               NONE, FALSE, FALSE)

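/* The fast path table: each entry matches an (operator, source
 * format, mask format, destination format) combination to one of the
 * composite functions above.  'solid' is a repeating single-pixel
 * image, 'null' means no mask, and the _ca entries take
 * component-alpha masks.
 */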
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),

    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),

    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),

    { PIXMAN_OP_NONE },
};

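/* blt and fill: try the SSE2 routine first and hand anything it
 * cannot do to the delegate, the next implementation in the
 * fallback chain.
 */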
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,
          int                      src_stride,
          int                      dst_stride,
          int                      src_bpp,
          int                      dst_bpp,
          int                      src_x,
          int                      src_y,
          int                      dest_x,
          int                      dest_y,
          int                      width,
          int                      height)
{
    if (!pixman_blt_sse2 (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dest_x, dest_y, width, height))
    {
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dest_x, dest_y, width, height);
    }

    return TRUE;
}

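/* On 32-bit x86 the ABI only guarantees 4-byte stack alignment;
 * force_align_arg_pointer makes GCC realign the stack on entry so
 * spilled __m128i values keep their required 16-byte alignment.
 */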
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
           int                      stride,
           int                      bpp,
           int                      x,
           int                      y,
           int                      width,
           int                      height,
           uint32_t xor)
{
    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}

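/* Iterator fetchers: convert one scanline of the source image into
 * a8r8g8b8 in iter->buffer.  For x8r8g8b8 the undefined x byte is
 * forced to 0xff so later stages can treat the pixels as opaque.
 */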
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((unsigned long)dst) & 0x0f)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    while (w >= 4)
    {
        save_128_aligned (
            (__m128i *)dst, _mm_or_si128 (
                load_128_unaligned ((__m128i *)src), ff000000));

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    return iter->buffer;
}

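/* Fetch r5g6b5: widen eight 16-bit pixels per iteration to 8888 via
 * unpack_565_to_8888 and set alpha to 0xff.
 */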
static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;

    iter->bits += iter->stride;

    while (w && ((unsigned long)dst) & 0x0f)
    {
        uint16_t s = *src++;

        *dst++ = CONVERT_0565_TO_8888 (s);
        w--;
    }

    while (w >= 8)
    {
        __m128i lo, hi, s;

        s = _mm_loadu_si128 ((__m128i *)src);

        lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
        hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));

        save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
        save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        uint16_t s = *src++;

        *dst++ = CONVERT_0565_TO_8888 (s);
        w--;
    }

    return iter->buffer;
}

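/* Fetch a8: shift each alpha byte into the top byte of a 32-bit
 * pixel.  Unpacking with zeros as the low bytes places the source
 * byte in the most significant position, expanding 16 input bytes
 * to 16 output pixels per iteration.
 */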
static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

    iter->bits += iter->stride;

    while (w && (((unsigned long)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    while (w >= 16)
    {
        xmm0 = _mm_loadu_si128((__m128i *)src);

        xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
        xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
        xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
        xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
        xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
        xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);

        _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
        _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
        _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
        _mm_store_si128(((__m128i *)(dst + 12)), xmm6);

        dst += 16;
        src += 16;
        w -= 16;
    }

    while (w)
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    return iter->buffer;
}

typedef struct
{
    pixman_format_code_t        format;
    pixman_iter_get_scanline_t  get_scanline;
} fetcher_info_t;

static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
    { PIXMAN_a8,                sse2_fetch_a8 },
    { PIXMAN_null }
};

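/* Use one of the SSE2 fetchers when the source image needs no
 * transformation, has the standard flags and the requested area lies
 * entirely within its bounds; otherwise defer to the delegate.
 */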
static void
sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
    pixman_image_t *image = iter->image;
    int x = iter->x;
    int y = iter->y;
    int width = iter->width;
    int height = iter->height;

#define FLAGS                                                           \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)

    if ((iter->flags & ITER_NARROW)                             &&
        (image->common.flags & FLAGS) == FLAGS                  &&
        x >= 0 && y >= 0                                        &&
        x + width <= image->bits.width                          &&
        y + height <= image->bits.height)
    {
        const fetcher_info_t *f;

        for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
        {
            if (image->common.extended_format_code == f->format)
            {
                uint8_t *b = (uint8_t *)image->bits.bits;
                int s = image->bits.rowstride * 4;

                iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
                iter->stride = s;

                iter->get_scanline = f->get_scanline;
                return;
            }
        }
    }

    imp->delegate->src_iter_init (imp->delegate, iter);
}

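/* Build the SSE2 implementation: initialize the constant masks used
 * throughout this file, then install the combiners, blt/fill and the
 * source iterator on top of the fallback implementation.
 */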
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

    /* SSE2 constants */
    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);

    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->src_iter_init = sse2_src_iter_init;

    return imp;
}