/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

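/* These mask constants are filled in when the SSE2 implementation is created
 * (not in this section). As a rough, presumed sketch of their contents:
 * mask_0080 holds 0x0080 and mask_0101 holds 0x0101 in every 16-bit lane,
 * mask_00ff holds 0x00ff, mask_ff000000 selects the alpha byte of each
 * a8r8g8b8 pixel, and the mask_565_* / mask_red / mask_green / mask_blue
 * values isolate r5g6b5 fields.
 *
 * Throughout this file, "unpacked" pixels are widened from 8 to 16 bits per
 * channel (interleaved with zero bytes) so per-channel products fit in a
 * lane; the pack_* helpers narrow them back with unsigned saturation.
 */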
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

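/* Expand four r5g6b5 pixels (one per 32-bit lane) to 8888. The shifts move
 * each field to its 8-bit position; the fix-up masks then replicate the top
 * bits of each field into the newly opened low bits, so a maximal field
 * expands to 0xff rather than 0xf8/0xfc. A scalar sketch of the same idea
 * for the red channel:
 *
 *     r8 = (r5 << 3) | (r5 >> 2);
 */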
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

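/* _mm_movemask_epi8 gathers the top bit of each of the 16 bytes. The alpha
 * bytes of four packed a8r8g8b8 pixels sit at byte offsets 3, 7, 11 and 15,
 * so masking the result with 0x8888 keeps exactly the four per-pixel alpha
 * comparison results. */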
static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

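/* Per-channel multiply of unpacked pixels: computes a rounded a * b / 255
 * in each 16-bit lane without a division. With t = a * b + 0x80, taking the
 * high 16 bits of t * 0x0101 (_mm_mulhi_epu16) equals (t + (t >> 8)) >> 8,
 * the classic correctly rounded division by 255. Scalar equivalent:
 *
 *     t = a * b + 0x80;
 *     r = (t + (t >> 8)) >> 8;
 */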
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}

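/* src * alpha_dst + dst * alpha_src, with a saturating final add. This is
 * the shared kernel of the ATOP, ATOP_REVERSE and XOR combiners below, which
 * differ only in which of the two alphas are complemented beforehand. */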
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

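/* The OVER operator on unpacked data:
 * dst = src + dst * (255 - alpha) / 255 per channel, with the final add
 * saturating. Callers pass the already expanded source alpha. */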
static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

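/* in_over = over (in (src, mask), in (alpha, mask), dst): both the source
 * and its alpha are first multiplied component-wise by the mask before the
 * OVER step. */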
static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels to an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

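/* OVER for a single 32-bit pixel, with the usual fast paths: a fully opaque
 * source replaces dst outright and a zero source leaves it untouched. */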
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

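/* combine1/combine4 fetch one or four source pixels and, when pm is
 * non-NULL, multiply them by the expanded alpha of the corresponding mask
 * pixels (a NULL pm means "no mask" in the unified combiner convention).
 * combine4 also short-circuits a fully transparent mask. */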
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

static force_inline void
core_combine_over_u_sse2_mask (uint32_t *         pd,
                               const uint32_t*    ps,
                               const uint32_t*    pm,
                               int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t*    ps,
                                  int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);
        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

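/* core_combine_in_u_pixel_sse2 (x, y) returns y scaled by x's alpha, with
 * the 0 and 0xff alpha cases short-circuited. The IN and IN_REVERSE
 * combiners below just swap the argument order. */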
static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

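/* ATOP: src * dst.a + dst * (1 - src.a). */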
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

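/* ATOP_REVERSE: src * (1 - dst.a) + dst * src.a. */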
static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

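/* XOR: src * (1 - dst.a) + dst * (1 - src.a). */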
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

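/* ADD: plain per-byte saturating addition; any mask is applied by
 * combine1/combine4 before the add. */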
static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}

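/* SATURATE: when the source alpha exceeds the room left in dst (~dst.a),
 * first scale the source by ~dst.a / src.a, then add. DIV_UN8 is the
 * rounded 8-bit division helper from pixman-combine32.h. */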
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned  ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
        /* if some src alpha is greater than the respective ~dst alpha */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

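/* Component-alpha (_ca) combiners: the mask carries an independent 8-bit
 * factor per channel, so it multiplies the source component-wise instead of
 * through a single expanded alpha. SRC_ca is simply src * mask. */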
static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }
}

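/* OVER_ca for one pixel: src * mask + dst * (1 - src.a * mask), i.e.
 * in_over (src, src.a, mask, dst). */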
static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i expAlpha = expand_alpha_1x128 (s);
    __m128i unpk_mask = unpack_32_1x128 (mask);
    __m128i unpk_dst  = unpack_32_1x128 (dst);

    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

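/* OVER_REVERSE_ca: dst OVER (src IN mask), i.e.
 * dst + (src * mask) * (1 - dst.a). */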
static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i d = unpack_32_1x128 (dst);

    return pack_1x128_32 (
        over_1x128 (d, expand_alpha_1x128 (d),
                    pix_multiply_1x128 (unpack_32_1x128 (src),
                                        unpack_32_1x128 (mask))));
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

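/* IN_ca: (src * mask) * dst.a, per channel. */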
static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }
}

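/* IN_REVERSE_ca: dst * (mask * src.a), per channel. */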
1764 static void
1765 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1766                             pixman_op_t              op,
1767                             uint32_t *               pd,
1768                             const uint32_t *         ps,
1769                             const uint32_t *         pm,
1770                             int                      w)
1771 {
1772     uint32_t s, m, d;
1773
1774     __m128i xmm_alpha_lo, xmm_alpha_hi;
1775     __m128i xmm_src_lo, xmm_src_hi;
1776     __m128i xmm_dst_lo, xmm_dst_hi;
1777     __m128i xmm_mask_lo, xmm_mask_hi;
1778
1779     while (w && (unsigned long)pd & 15)
1780     {
1781         s = *ps++;
1782         m = *pm++;
1783         d = *pd;
1784
1785         *pd++ = pack_1x128_32 (
1786             pix_multiply_1x128 (
1787                 unpack_32_1x128 (d),
1788                 pix_multiply_1x128 (unpack_32_1x128 (m),
1789                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1790         w--;
1791     }
1792
1793     while (w >= 4)
1794     {
1795         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1796         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1797         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1798
1799         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1800         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1801         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1802
1803         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1804                             &xmm_alpha_lo, &xmm_alpha_hi);
1805         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1806                             &xmm_alpha_lo, &xmm_alpha_hi,
1807                             &xmm_alpha_lo, &xmm_alpha_hi);
1808
1809         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1810                             &xmm_alpha_lo, &xmm_alpha_hi,
1811                             &xmm_dst_lo, &xmm_dst_hi);
1812
1813         save_128_aligned (
1814             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1815
1816         ps += 4;
1817         pd += 4;
1818         pm += 4;
1819         w -= 4;
1820     }
1821
1822     while (w)
1823     {
1824         s = *ps++;
1825         m = *pm++;
1826         d = *pd;
1827
1828         *pd++ = pack_1x128_32 (
1829             pix_multiply_1x128 (
1830                 unpack_32_1x128 (d),
1831                 pix_multiply_1x128 (unpack_32_1x128 (m),
1832                                     expand_alpha_1x128 (unpack_32_1x128 (s)))));
1833         w--;
1834     }
1835 }
1836
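/* Component-alpha OUT: per channel,
 *
 *     dst = (src * mask) * (1 - dst.a)
 */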
1837 static void
1838 sse2_combine_out_ca (pixman_implementation_t *imp,
1839                      pixman_op_t              op,
1840                      uint32_t *               pd,
1841                      const uint32_t *         ps,
1842                      const uint32_t *         pm,
1843                      int                      w)
1844 {
1845     uint32_t s, m, d;
1846
1847     __m128i xmm_alpha_lo, xmm_alpha_hi;
1848     __m128i xmm_src_lo, xmm_src_hi;
1849     __m128i xmm_dst_lo, xmm_dst_hi;
1850     __m128i xmm_mask_lo, xmm_mask_hi;
1851
1852     while (w && (unsigned long)pd & 15)
1853     {
1854         s = *ps++;
1855         m = *pm++;
1856         d = *pd;
1857
1858         *pd++ = pack_1x128_32 (
1859             pix_multiply_1x128 (
1860                 pix_multiply_1x128 (
1861                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1862                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1863         w--;
1864     }
1865
1866     while (w >= 4)
1867     {
1868         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1869         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1870         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1871
1872         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1873         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1874         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1875
1876         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1877                             &xmm_alpha_lo, &xmm_alpha_hi);
1878         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1879                       &xmm_alpha_lo, &xmm_alpha_hi);
1880
1881         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1882                             &xmm_mask_lo, &xmm_mask_hi,
1883                             &xmm_dst_lo, &xmm_dst_hi);
1884         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1885                             &xmm_alpha_lo, &xmm_alpha_hi,
1886                             &xmm_dst_lo, &xmm_dst_hi);
1887
1888         save_128_aligned (
1889             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1890
1891         ps += 4;
1892         pd += 4;
1893         pm += 4;
1894         w -= 4;
1895     }
1896
1897     while (w)
1898     {
1899         s = *ps++;
1900         m = *pm++;
1901         d = *pd;
1902
1903         *pd++ = pack_1x128_32 (
1904             pix_multiply_1x128 (
1905                 pix_multiply_1x128 (
1906                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1907                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1908
1909         w--;
1910     }
1911 }
1912
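/* Component-alpha OUT_REVERSE: per channel,
 *
 *     dst = dst * (1 - mask * src.a)
 */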
1913 static void
1914 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1915                              pixman_op_t              op,
1916                              uint32_t *               pd,
1917                              const uint32_t *         ps,
1918                              const uint32_t *         pm,
1919                              int                      w)
1920 {
1921     uint32_t s, m, d;
1922
1923     __m128i xmm_alpha_lo, xmm_alpha_hi;
1924     __m128i xmm_src_lo, xmm_src_hi;
1925     __m128i xmm_dst_lo, xmm_dst_hi;
1926     __m128i xmm_mask_lo, xmm_mask_hi;
1927
1928     while (w && (unsigned long)pd & 15)
1929     {
1930         s = *ps++;
1931         m = *pm++;
1932         d = *pd;
1933
1934         *pd++ = pack_1x128_32 (
1935             pix_multiply_1x128 (
1936                 unpack_32_1x128 (d),
1937                 negate_1x128 (pix_multiply_1x128 (
1938                                  unpack_32_1x128 (m),
1939                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1940         w--;
1941     }
1942
1943     while (w >= 4)
1944     {
1945         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1946         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1947         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1948
1949         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1950         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1951         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1952
1953         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1954                             &xmm_alpha_lo, &xmm_alpha_hi);
1955
1956         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1957                             &xmm_alpha_lo, &xmm_alpha_hi,
1958                             &xmm_mask_lo, &xmm_mask_hi);
1959
1960         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1961                       &xmm_mask_lo, &xmm_mask_hi);
1962
1963         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1964                             &xmm_mask_lo, &xmm_mask_hi,
1965                             &xmm_dst_lo, &xmm_dst_hi);
1966
1967         save_128_aligned (
1968             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1969
1970         ps += 4;
1971         pd += 4;
1972         pm += 4;
1973         w -= 4;
1974     }
1975
1976     while (w)
1977     {
1978         s = *ps++;
1979         m = *pm++;
1980         d = *pd;
1981
1982         *pd++ = pack_1x128_32 (
1983             pix_multiply_1x128 (
1984                 unpack_32_1x128 (d),
1985                 negate_1x128 (pix_multiply_1x128 (
1986                                  unpack_32_1x128 (m),
1987                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1988         w--;
1989     }
1990 }
1991
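/* Component-alpha ATOP: per channel,
 *
 *     dst = src * mask * dst.a + dst * (1 - mask * src.a)
 *
 * pix_add_multiply_1x128 computes both products and their
 * saturated sum in a single call.
 */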
1992 static force_inline uint32_t
1993 core_combine_atop_ca_pixel_sse2 (uint32_t src,
1994                                  uint32_t mask,
1995                                  uint32_t dst)
1996 {
1997     __m128i m = unpack_32_1x128 (mask);
1998     __m128i s = unpack_32_1x128 (src);
1999     __m128i d = unpack_32_1x128 (dst);
2000     __m128i sa = expand_alpha_1x128 (s);
2001     __m128i da = expand_alpha_1x128 (d);
2002
2003     s = pix_multiply_1x128 (s, m);
2004     m = negate_1x128 (pix_multiply_1x128 (m, sa));
2005
2006     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2007 }
2008
2009 static void
2010 sse2_combine_atop_ca (pixman_implementation_t *imp,
2011                       pixman_op_t              op,
2012                       uint32_t *               pd,
2013                       const uint32_t *         ps,
2014                       const uint32_t *         pm,
2015                       int                      w)
2016 {
2017     uint32_t s, m, d;
2018
2019     __m128i xmm_src_lo, xmm_src_hi;
2020     __m128i xmm_dst_lo, xmm_dst_hi;
2021     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2022     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2023     __m128i xmm_mask_lo, xmm_mask_hi;
2024
2025     while (w && (unsigned long)pd & 15)
2026     {
2027         s = *ps++;
2028         m = *pm++;
2029         d = *pd;
2030
2031         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2032         w--;
2033     }
2034
2035     while (w >= 4)
2036     {
2037         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2038         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2039         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2040
2041         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2042         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2043         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2044
2045         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2046                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2047         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2048                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2049
2050         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2051                             &xmm_mask_lo, &xmm_mask_hi,
2052                             &xmm_src_lo, &xmm_src_hi);
2053         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2054                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2055                             &xmm_mask_lo, &xmm_mask_hi);
2056
2057         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2058
2059         pix_add_multiply_2x128 (
2060             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2061             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2062             &xmm_dst_lo, &xmm_dst_hi);
2063
2064         save_128_aligned (
2065             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2066
2067         ps += 4;
2068         pd += 4;
2069         pm += 4;
2070         w -= 4;
2071     }
2072
2073     while (w)
2074     {
2075         s = *ps++;
2076         m = *pm++;
2077         d = *pd;
2078
2079         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2080         w--;
2081     }
2082 }
2083
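/* Component-alpha ATOP_REVERSE: per channel,
 *
 *     dst = src * mask * (1 - dst.a) + dst * (mask * src.a)
 */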
2084 static force_inline uint32_t
2085 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2086                                          uint32_t mask,
2087                                          uint32_t dst)
2088 {
2089     __m128i m = unpack_32_1x128 (mask);
2090     __m128i s = unpack_32_1x128 (src);
2091     __m128i d = unpack_32_1x128 (dst);
2092
2093     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2094     __m128i sa = expand_alpha_1x128 (s);
2095
2096     s = pix_multiply_1x128 (s, m);
2097     m = pix_multiply_1x128 (m, sa);
2098
2099     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2100 }
2101
2102 static void
2103 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2104                               pixman_op_t              op,
2105                               uint32_t *               pd,
2106                               const uint32_t *         ps,
2107                               const uint32_t *         pm,
2108                               int                      w)
2109 {
2110     uint32_t s, m, d;
2111
2112     __m128i xmm_src_lo, xmm_src_hi;
2113     __m128i xmm_dst_lo, xmm_dst_hi;
2114     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2115     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2116     __m128i xmm_mask_lo, xmm_mask_hi;
2117
2118     while (w && (unsigned long)pd & 15)
2119     {
2120         s = *ps++;
2121         m = *pm++;
2122         d = *pd;
2123
2124         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2125         w--;
2126     }
2127
2128     while (w >= 4)
2129     {
2130         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2131         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2132         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2133
2134         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2135         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2136         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2137
2138         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2139                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2140         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2141                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2142
2143         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2144                             &xmm_mask_lo, &xmm_mask_hi,
2145                             &xmm_src_lo, &xmm_src_hi);
2146         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2147                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2148                             &xmm_mask_lo, &xmm_mask_hi);
2149
2150         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2151                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2152
2153         pix_add_multiply_2x128 (
2154             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2155             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2156             &xmm_dst_lo, &xmm_dst_hi);
2157
2158         save_128_aligned (
2159             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2160
2161         ps += 4;
2162         pd += 4;
2163         pm += 4;
2164         w -= 4;
2165     }
2166
2167     while (w)
2168     {
2169         s = *ps++;
2170         m = *pm++;
2171         d = *pd;
2172
2173         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2174         w--;
2175     }
2176 }
2177
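/* Component-alpha XOR: per channel,
 *
 *     dst = src * mask * (1 - dst.a) + dst * (1 - mask * src.a)
 */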
2178 static force_inline uint32_t
2179 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2180                                 uint32_t mask,
2181                                 uint32_t dst)
2182 {
2183     __m128i a = unpack_32_1x128 (mask);
2184     __m128i s = unpack_32_1x128 (src);
2185     __m128i d = unpack_32_1x128 (dst);
2186
2187     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2188                                        a, expand_alpha_1x128 (s)));
2189     __m128i dest      = pix_multiply_1x128 (s, a);
2190     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2191
2192     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2193                                                   &alpha_dst,
2194                                                   &dest,
2195                                                   &alpha_src));
2196 }
2197
2198 static void
2199 sse2_combine_xor_ca (pixman_implementation_t *imp,
2200                      pixman_op_t              op,
2201                      uint32_t *               pd,
2202                      const uint32_t *         ps,
2203                      const uint32_t *         pm,
2204                      int                      w)
2205 {
2206     uint32_t s, m, d;
2207
2208     __m128i xmm_src_lo, xmm_src_hi;
2209     __m128i xmm_dst_lo, xmm_dst_hi;
2210     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2211     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2212     __m128i xmm_mask_lo, xmm_mask_hi;
2213
2214     while (w && (unsigned long)pd & 15)
2215     {
2216         s = *ps++;
2217         m = *pm++;
2218         d = *pd;
2219
2220         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2221         w--;
2222     }
2223
2224     while (w >= 4)
2225     {
2226         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2227         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2228         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2229
2230         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2231         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2232         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2233
2234         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2235                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2236         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2237                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2238
2239         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2240                             &xmm_mask_lo, &xmm_mask_hi,
2241                             &xmm_src_lo, &xmm_src_hi);
2242         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2243                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2244                             &xmm_mask_lo, &xmm_mask_hi);
2245
2246         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2247                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2248         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2249                       &xmm_mask_lo, &xmm_mask_hi);
2250
2251         pix_add_multiply_2x128 (
2252             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2253             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2254             &xmm_dst_lo, &xmm_dst_hi);
2255
2256         save_128_aligned (
2257             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2258
2259         ps += 4;
2260         pd += 4;
2261         pm += 4;
2262         w -= 4;
2263     }
2264
2265     while (w)
2266     {
2267         s = *ps++;
2268         m = *pm++;
2269         d = *pd;
2270
2271         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2272         w--;
2273     }
2274 }
2275
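/* Component-alpha ADD: per channel,
 *
 *     dst = saturate (src * mask + dst)
 *
 * using the unsigned saturating byte add _mm_adds_epu8.
 */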
2276 static void
2277 sse2_combine_add_ca (pixman_implementation_t *imp,
2278                      pixman_op_t              op,
2279                      uint32_t *               pd,
2280                      const uint32_t *         ps,
2281                      const uint32_t *         pm,
2282                      int                      w)
2283 {
2284     uint32_t s, m, d;
2285
2286     __m128i xmm_src_lo, xmm_src_hi;
2287     __m128i xmm_dst_lo, xmm_dst_hi;
2288     __m128i xmm_mask_lo, xmm_mask_hi;
2289
2290     while (w && (unsigned long)pd & 15)
2291     {
2292         s = *ps++;
2293         m = *pm++;
2294         d = *pd;
2295
2296         *pd++ = pack_1x128_32 (
2297             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2298                                                unpack_32_1x128 (m)),
2299                            unpack_32_1x128 (d)));
2300         w--;
2301     }
2302
2303     while (w >= 4)
2304     {
2305         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2306         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2307         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2308
2309         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2310         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2311         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2312
2313         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2314                             &xmm_mask_lo, &xmm_mask_hi,
2315                             &xmm_src_lo, &xmm_src_hi);
2316
2317         save_128_aligned (
2318             (__m128i*)pd, pack_2x128_128 (
2319                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2320                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2321
2322         ps += 4;
2323         pd += 4;
2324         pm += 4;
2325         w -= 4;
2326     }
2327
2328     while (w)
2329     {
2330         s = *ps++;
2331         m = *pm++;
2332         d = *pd;
2333
2334         *pd++ = pack_1x128_32 (
2335             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2336                                                unpack_32_1x128 (m)),
2337                            unpack_32_1x128 (d)));
2338         w--;
2339     }
2340 }
2341
2342 static force_inline __m128i
2343 create_mask_16_128 (uint16_t mask)
2344 {
2345     return _mm_set1_epi16 (mask);
2346 }
2347
2348 /* Work around a code generation bug in Sun Studio 12. */
2349 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2350 # define create_mask_2x32_128(mask0, mask1)                             \
2351     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2352 #else
2353 static force_inline __m128i
2354 create_mask_2x32_128 (uint32_t mask0,
2355                       uint32_t mask1)
2356 {
2357     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2358 }
2359 #endif
2360
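/* The composite fast paths below all follow the same scanline
 * pattern: process single pixels with the 1x128 helpers until dst
 * reaches a 16-byte boundary, handle the bulk four 32bpp (or eight
 * 16bpp) pixels at a time with aligned 128-bit stores, then finish
 * the remainder one pixel at a time.
 */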
2361 static void
2362 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2363                             pixman_op_t              op,
2364                             pixman_image_t *         src_image,
2365                             pixman_image_t *         mask_image,
2366                             pixman_image_t *         dst_image,
2367                             int32_t                  src_x,
2368                             int32_t                  src_y,
2369                             int32_t                  mask_x,
2370                             int32_t                  mask_y,
2371                             int32_t                  dest_x,
2372                             int32_t                  dest_y,
2373                             int32_t                  width,
2374                             int32_t                  height)
2375 {
2376     uint32_t src;
2377     uint32_t    *dst_line, *dst, d;
2378     int32_t w;
2379     int dst_stride;
2380     __m128i xmm_src, xmm_alpha;
2381     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2382
2383     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2384
2385     if (src == 0)
2386         return;
2387
2388     PIXMAN_IMAGE_GET_LINE (
2389         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2390
2391     xmm_src = expand_pixel_32_1x128 (src);
2392     xmm_alpha = expand_alpha_1x128 (xmm_src);
2393
2394     while (height--)
2395     {
2396         dst = dst_line;
2397
2398         dst_line += dst_stride;
2399         w = width;
2400
2401         while (w && (unsigned long)dst & 15)
2402         {
2403             d = *dst;
2404             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2405                                                 xmm_alpha,
2406                                                 unpack_32_1x128 (d)));
2407             w--;
2408         }
2409
2410         while (w >= 4)
2411         {
2412             xmm_dst = load_128_aligned ((__m128i*)dst);
2413
2414             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2415
2416             over_2x128 (&xmm_src, &xmm_src,
2417                         &xmm_alpha, &xmm_alpha,
2418                         &xmm_dst_lo, &xmm_dst_hi);
2419
2420             /* rebuild the 4 pixel data and save */
2421             save_128_aligned (
2422                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2423
2424             w -= 4;
2425             dst += 4;
2426         }
2427
2428         while (w)
2429         {
2430             d = *dst;
2431             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2432                                                 xmm_alpha,
2433                                                 unpack_32_1x128 (d)));
2434             w--;
2435         }
2436
2437     }
2438 }
2439
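/* OVER with a solid source onto r5g6b5: each 128-bit destination
 * block holds eight 565 pixels, which unpack_565_128_4x128 widens
 * into four registers of 8888 data before blending and repacking.
 */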
2440 static void
2441 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2442                             pixman_op_t              op,
2443                             pixman_image_t *         src_image,
2444                             pixman_image_t *         mask_image,
2445                             pixman_image_t *         dst_image,
2446                             int32_t                  src_x,
2447                             int32_t                  src_y,
2448                             int32_t                  mask_x,
2449                             int32_t                  mask_y,
2450                             int32_t                  dest_x,
2451                             int32_t                  dest_y,
2452                             int32_t                  width,
2453                             int32_t                  height)
2454 {
2455     uint32_t src;
2456     uint16_t    *dst_line, *dst, d;
2457     int32_t w;
2458     int dst_stride;
2459     __m128i xmm_src, xmm_alpha;
2460     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2461
2462     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2463
2464     if (src == 0)
2465         return;
2466
2467     PIXMAN_IMAGE_GET_LINE (
2468         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2469
2470     xmm_src = expand_pixel_32_1x128 (src);
2471     xmm_alpha = expand_alpha_1x128 (xmm_src);
2472
2473     while (height--)
2474     {
2475         dst = dst_line;
2476
2477         dst_line += dst_stride;
2478         w = width;
2479
2480         while (w && (unsigned long)dst & 15)
2481         {
2482             d = *dst;
2483
2484             *dst++ = pack_565_32_16 (
2485                 pack_1x128_32 (over_1x128 (xmm_src,
2486                                            xmm_alpha,
2487                                            expand565_16_1x128 (d))));
2488             w--;
2489         }
2490
2491         while (w >= 8)
2492         {
2493             xmm_dst = load_128_aligned ((__m128i*)dst);
2494
2495             unpack_565_128_4x128 (xmm_dst,
2496                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2497
2498             over_2x128 (&xmm_src, &xmm_src,
2499                         &xmm_alpha, &xmm_alpha,
2500                         &xmm_dst0, &xmm_dst1);
2501             over_2x128 (&xmm_src, &xmm_src,
2502                         &xmm_alpha, &xmm_alpha,
2503                         &xmm_dst2, &xmm_dst3);
2504
2505             xmm_dst = pack_565_4x128_128 (
2506                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507
2508             save_128_aligned ((__m128i*)dst, xmm_dst);
2509
2510             dst += 8;
2511             w -= 8;
2512         }
2513
2514         while (w--)
2515         {
2516             d = *dst;
2517             *dst++ = pack_565_32_16 (
2518                 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2519                                            expand565_16_1x128 (d))));
2520         }
2521     }
2522
2523 }
2524
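/* ADD with a solid source and a component-alpha mask:
 *
 *     dst = saturate (src * mask + dst)
 *
 * Four-pixel blocks whose mask words are all zero are detected
 * with _mm_movemask_epi8 and skipped entirely.
 */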
2525 static void
2526 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2527                                    pixman_op_t              op,
2528                                    pixman_image_t *         src_image,
2529                                    pixman_image_t *         mask_image,
2530                                    pixman_image_t *         dst_image,
2531                                    int32_t                  src_x,
2532                                    int32_t                  src_y,
2533                                    int32_t                  mask_x,
2534                                    int32_t                  mask_y,
2535                                    int32_t                  dest_x,
2536                                    int32_t                  dest_y,
2537                                    int32_t                  width,
2538                                    int32_t                  height)
2539 {
2540     uint32_t src, srca;
2541     uint32_t    *dst_line, d;
2542     uint32_t    *mask_line, m;
2543     uint32_t pack_cmp;
2544     int dst_stride, mask_stride;
2545
2546     __m128i xmm_src, xmm_alpha;
2547     __m128i xmm_dst;
2548     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549
2550     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2551
2552     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2553     srca = src >> 24;
2554
2555     if (src == 0)
2556         return;
2557
2558     PIXMAN_IMAGE_GET_LINE (
2559         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2560     PIXMAN_IMAGE_GET_LINE (
2561         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2562
2563     xmm_src = _mm_unpacklo_epi8 (
2564         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2565     xmm_alpha = expand_alpha_1x128 (xmm_src);
2566     mmx_src   = xmm_src;
2567     mmx_alpha = xmm_alpha;
2568
2569     while (height--)
2570     {
2571         int w = width;
2572         const uint32_t *pm = (uint32_t *)mask_line;
2573         uint32_t *pd = (uint32_t *)dst_line;
2574
2575         dst_line += dst_stride;
2576         mask_line += mask_stride;
2577
2578         while (w && (unsigned long)pd & 15)
2579         {
2580             m = *pm++;
2581
2582             if (m)
2583             {
2584                 d = *pd;
2585
2586                 mmx_mask = unpack_32_1x128 (m);
2587                 mmx_dest = unpack_32_1x128 (d);
2588
2589                 *pd = pack_1x128_32 (
2590                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2591             }
2592
2593             pd++;
2594             w--;
2595         }
2596
2597         while (w >= 4)
2598         {
2599             xmm_mask = load_128_unaligned ((__m128i*)pm);
2600
2601             pack_cmp =
2602                 _mm_movemask_epi8 (
2603                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2604
2605             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2606             if (pack_cmp != 0xffff)
2607             {
2608                 xmm_dst = load_128_aligned ((__m128i*)pd);
2609
2610                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2611
2612                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2613                                     &xmm_mask_lo, &xmm_mask_hi,
2614                                     &xmm_mask_lo, &xmm_mask_hi);
2615                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2616
2617                 save_128_aligned (
2618                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2619             }
2620
2621             pd += 4;
2622             pm += 4;
2623             w -= 4;
2624         }
2625
2626         while (w)
2627         {
2628             m = *pm++;
2629
2630             if (m)
2631             {
2632                 d = *pd;
2633
2634                 mmx_mask = unpack_32_1x128 (m);
2635                 mmx_dest = unpack_32_1x128 (d);
2636
2637                 *pd = pack_1x128_32 (
2638                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2639             }
2640
2641             pd++;
2642             w--;
2643         }
2644     }
2645
2646 }
2647
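/* OVER with a solid source and a component-alpha mask:
 *
 *     dst = src * mask + dst * (1 - mask * src.a)
 *
 * in_over_1x128/in_over_2x128 perform the combined IN and OVER
 * steps; all-zero mask blocks are skipped as in the ADD path above.
 */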
2648 static void
2649 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2650                                     pixman_op_t              op,
2651                                     pixman_image_t *         src_image,
2652                                     pixman_image_t *         mask_image,
2653                                     pixman_image_t *         dst_image,
2654                                     int32_t                  src_x,
2655                                     int32_t                  src_y,
2656                                     int32_t                  mask_x,
2657                                     int32_t                  mask_y,
2658                                     int32_t                  dest_x,
2659                                     int32_t                  dest_y,
2660                                     int32_t                  width,
2661                                     int32_t                  height)
2662 {
2663     uint32_t src;
2664     uint32_t    *dst_line, d;
2665     uint32_t    *mask_line, m;
2666     uint32_t pack_cmp;
2667     int dst_stride, mask_stride;
2668
2669     __m128i xmm_src, xmm_alpha;
2670     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2671     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2672
2673     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2674
2675     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2676
2677     if (src == 0)
2678         return;
2679
2680     PIXMAN_IMAGE_GET_LINE (
2681         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2682     PIXMAN_IMAGE_GET_LINE (
2683         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2684
2685     xmm_src = _mm_unpacklo_epi8 (
2686         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2687     xmm_alpha = expand_alpha_1x128 (xmm_src);
2688     mmx_src   = xmm_src;
2689     mmx_alpha = xmm_alpha;
2690
2691     while (height--)
2692     {
2693         int w = width;
2694         const uint32_t *pm = (uint32_t *)mask_line;
2695         uint32_t *pd = (uint32_t *)dst_line;
2696
2697         dst_line += dst_stride;
2698         mask_line += mask_stride;
2699
2700         while (w && (unsigned long)pd & 15)
2701         {
2702             m = *pm++;
2703
2704             if (m)
2705             {
2706                 d = *pd;
2707                 mmx_mask = unpack_32_1x128 (m);
2708                 mmx_dest = unpack_32_1x128 (d);
2709
2710                 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2711                                                     &mmx_alpha,
2712                                                     &mmx_mask,
2713                                                     &mmx_dest));
2714             }
2715
2716             pd++;
2717             w--;
2718         }
2719
2720         while (w >= 4)
2721         {
2722             xmm_mask = load_128_unaligned ((__m128i*)pm);
2723
2724             pack_cmp =
2725                 _mm_movemask_epi8 (
2726                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2727
2728             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2729             if (pack_cmp != 0xffff)
2730             {
2731                 xmm_dst = load_128_aligned ((__m128i*)pd);
2732
2733                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2734                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2735
2736                 in_over_2x128 (&xmm_src, &xmm_src,
2737                                &xmm_alpha, &xmm_alpha,
2738                                &xmm_mask_lo, &xmm_mask_hi,
2739                                &xmm_dst_lo, &xmm_dst_hi);
2740
2741                 save_128_aligned (
2742                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2743             }
2744
2745             pd += 4;
2746             pm += 4;
2747             w -= 4;
2748         }
2749
2750         while (w)
2751         {
2752             m = *pm++;
2753
2754             if (m)
2755             {
2756                 d = *pd;
2757                 mmx_mask = unpack_32_1x128 (m);
2758                 mmx_dest = unpack_32_1x128 (d);
2759
2760                 *pd = pack_1x128_32 (
2761                     in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2762             }
2763
2764             pd++;
2765             w--;
2766         }
2767     }
2768
2769 }
2770
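/* OVER of an a8r8g8b8 source through a solid mask: only the alpha
 * byte of the mask is used, replicated into every 16-bit lane, so
 *
 *     dst = src * mask.a + dst * (1 - src.a * mask.a)
 *
 * Source blocks that are entirely zero leave dst untouched.
 */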
2771 static void
2772 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2773                                  pixman_op_t              op,
2774                                  pixman_image_t *         src_image,
2775                                  pixman_image_t *         mask_image,
2776                                  pixman_image_t *         dst_image,
2777                                  int32_t                  src_x,
2778                                  int32_t                  src_y,
2779                                  int32_t                  mask_x,
2780                                  int32_t                  mask_y,
2781                                  int32_t                  dest_x,
2782                                  int32_t                  dest_y,
2783                                  int32_t                  width,
2784                                  int32_t                  height)
2785 {
2786     uint32_t    *dst_line, *dst;
2787     uint32_t    *src_line, *src;
2788     uint32_t mask;
2789     int32_t w;
2790     int dst_stride, src_stride;
2791
2792     __m128i xmm_mask;
2793     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2794     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2795     __m128i xmm_alpha_lo, xmm_alpha_hi;
2796
2797     PIXMAN_IMAGE_GET_LINE (
2798         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2799     PIXMAN_IMAGE_GET_LINE (
2800         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2801
2802     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2803
2804     xmm_mask = create_mask_16_128 (mask >> 24);
2805
2806     while (height--)
2807     {
2808         dst = dst_line;
2809         dst_line += dst_stride;
2810         src = src_line;
2811         src_line += src_stride;
2812         w = width;
2813
2814         while (w && (unsigned long)dst & 15)
2815         {
2816             uint32_t s = *src++;
2817
2818             if (s)
2819             {
2820                 uint32_t d = *dst;
2821                 
2822                 __m128i ms = unpack_32_1x128 (s);
2823                 __m128i alpha    = expand_alpha_1x128 (ms);
2824                 __m128i mask     = xmm_mask;
2825                 __m128i dest     = unpack_32_1x128 (d);
2826
2827                 *dst = pack_1x128_32 (
2828                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2829             }
2830             dst++;
2831             w--;
2832         }
2833
2834         while (w >= 4)
2835         {
2836             xmm_src = load_128_unaligned ((__m128i*)src);
2837
2838             if (!is_zero (xmm_src))
2839             {
2840                 xmm_dst = load_128_aligned ((__m128i*)dst);
2841                 
2842                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2843                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2844                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2845                                     &xmm_alpha_lo, &xmm_alpha_hi);
2846                 
2847                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2848                                &xmm_alpha_lo, &xmm_alpha_hi,
2849                                &xmm_mask, &xmm_mask,
2850                                &xmm_dst_lo, &xmm_dst_hi);
2851                 
2852                 save_128_aligned (
2853                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2854             }
2855                 
2856             dst += 4;
2857             src += 4;
2858             w -= 4;
2859         }
2860
2861         while (w)
2862         {
2863             uint32_t s = *src++;
2864
2865             if (s)
2866             {
2867                 uint32_t d = *dst;
2868                 
2869                 __m128i ms = unpack_32_1x128 (s);
2870                 __m128i alpha = expand_alpha_1x128 (ms);
2871                 __m128i mask  = xmm_mask;
2872                 __m128i dest  = unpack_32_1x128 (d);
2873                 
2874                 *dst = pack_1x128_32 (
2875                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2876             }
2877
2878             dst++;
2879             w--;
2880         }
2881     }
2882
2883 }
2884
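/* SRC from x888 to 8888: a straight copy that forces the undefined
 * alpha byte to 0xff, moving sixteen pixels per iteration of the
 * main loop.
 */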
2885 static void
2886 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2887                               pixman_op_t              op,
2888                               pixman_image_t *         src_image,
2889                               pixman_image_t *         mask_image,
2890                               pixman_image_t *         dst_image,
2891                               int32_t                  src_x,
2892                               int32_t                  src_y,
2893                               int32_t                  mask_x,
2894                               int32_t                  mask_y,
2895                               int32_t                  dest_x,
2896                               int32_t                  dest_y,
2897                               int32_t                  width,
2898                               int32_t                  height)
2899 {
2900     uint32_t    *dst_line, *dst;
2901     uint32_t    *src_line, *src;
2902     int32_t w;
2903     int dst_stride, src_stride;
2904
2905
2906     PIXMAN_IMAGE_GET_LINE (
2907         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2908     PIXMAN_IMAGE_GET_LINE (
2909         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2910
2911     while (height--)
2912     {
2913         dst = dst_line;
2914         dst_line += dst_stride;
2915         src = src_line;
2916         src_line += src_stride;
2917         w = width;
2918
2919         while (w && (unsigned long)dst & 15)
2920         {
2921             *dst++ = *src++ | 0xff000000;
2922             w--;
2923         }
2924
2925         while (w >= 16)
2926         {
2927             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2928             
2929             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2930             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2931             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2932             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2933             
2934             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2935             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2936             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2937             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2938             
2939             dst += 16;
2940             src += 16;
2941             w -= 16;
2942         }
2943
2944         while (w)
2945         {
2946             *dst++ = *src++ | 0xff000000;
2947             w--;
2948         }
2949     }
2950
2951 }
2952
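/* OVER of an x888 source through a solid mask: the source is first
 * promoted to a8r8g8b8 by OR-ing in an opaque alpha byte, so the
 * source alpha term is simply the constant mask_00ff.
 */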
2953 static void
2954 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2955                                  pixman_op_t              op,
2956                                  pixman_image_t *         src_image,
2957                                  pixman_image_t *         mask_image,
2958                                  pixman_image_t *         dst_image,
2959                                  int32_t                  src_x,
2960                                  int32_t                  src_y,
2961                                  int32_t                  mask_x,
2962                                  int32_t                  mask_y,
2963                                  int32_t                  dest_x,
2964                                  int32_t                  dest_y,
2965                                  int32_t                  width,
2966                                  int32_t                  height)
2967 {
2968     uint32_t    *dst_line, *dst;
2969     uint32_t    *src_line, *src;
2970     uint32_t mask;
2971     int dst_stride, src_stride;
2972     int32_t w;
2973
2974     __m128i xmm_mask, xmm_alpha;
2975     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2976     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2977
2978     PIXMAN_IMAGE_GET_LINE (
2979         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2980     PIXMAN_IMAGE_GET_LINE (
2981         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2982
2983     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2984
2985     xmm_mask = create_mask_16_128 (mask >> 24);
2986     xmm_alpha = mask_00ff;
2987
2988     while (height--)
2989     {
2990         dst = dst_line;
2991         dst_line += dst_stride;
2992         src = src_line;
2993         src_line += src_stride;
2994         w = width;
2995
2996         while (w && (unsigned long)dst & 15)
2997         {
2998             uint32_t s = (*src++) | 0xff000000;
2999             uint32_t d = *dst;
3000
3001             __m128i src   = unpack_32_1x128 (s);
3002             __m128i alpha = xmm_alpha;
3003             __m128i mask  = xmm_mask;
3004             __m128i dest  = unpack_32_1x128 (d);
3005
3006             *dst++ = pack_1x128_32 (
3007                 in_over_1x128 (&src, &alpha, &mask, &dest));
3008
3009             w--;
3010         }
3011
3012         while (w >= 4)
3013         {
3014             xmm_src = _mm_or_si128 (
3015                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3016             xmm_dst = load_128_aligned ((__m128i*)dst);
3017
3018             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3019             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3020
3021             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3022                            &xmm_alpha, &xmm_alpha,
3023                            &xmm_mask, &xmm_mask,
3024                            &xmm_dst_lo, &xmm_dst_hi);
3025
3026             save_128_aligned (
3027                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3028
3029             dst += 4;
3030             src += 4;
3031             w -= 4;
3032
3033         }
3034
3035         while (w)
3036         {
3037             uint32_t s = (*src++) | 0xff000000;
3038             uint32_t d = *dst;
3039
3040             __m128i src  = unpack_32_1x128 (s);
3041             __m128i alpha = xmm_alpha;
3042             __m128i mask  = xmm_mask;
3043             __m128i dest  = unpack_32_1x128 (d);
3044
3045             *dst++ = pack_1x128_32 (
3046                 in_over_1x128 (&src, &alpha, &mask, &dest));
3047
3048             w--;
3049         }
3050     }
3051
3052 }
3053
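/* OVER between two a8r8g8b8 images: no per-scanline setup is
 * needed, so each line is handed directly to the unified-alpha
 * combiner sse2_combine_over_u.
 */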
3054 static void
3055 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3056                                pixman_op_t              op,
3057                                pixman_image_t *         src_image,
3058                                pixman_image_t *         mask_image,
3059                                pixman_image_t *         dst_image,
3060                                int32_t                  src_x,
3061                                int32_t                  src_y,
3062                                int32_t                  mask_x,
3063                                int32_t                  mask_y,
3064                                int32_t                  dest_x,
3065                                int32_t                  dest_y,
3066                                int32_t                  width,
3067                                int32_t                  height)
3068 {
3069     int dst_stride, src_stride;
3070     uint32_t    *dst_line, *dst;
3071     uint32_t    *src_line, *src;
3072
3073     PIXMAN_IMAGE_GET_LINE (
3074         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3075     PIXMAN_IMAGE_GET_LINE (
3076         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3077
3078     dst = dst_line;
3079     src = src_line;
3080
3081     while (height--)
3082     {
3083         sse2_combine_over_u (imp, op, dst, src, NULL, width);
3084
3085         dst += dst_stride;
3086         src += src_stride;
3087     }
3088 }
3089
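/* OVER of a single a8r8g8b8 pixel onto an r5g6b5 pixel: the 565
 * value is expanded to 8888, blended, and packed back down.
 */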
3090 static force_inline uint16_t
3091 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3092 {
3093     __m128i ms;
3094
3095     ms = unpack_32_1x128 (src);
3096     return pack_565_32_16 (
3097         pack_1x128_32 (
3098             over_1x128 (
3099                 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3100 }
3101
3102 static void
3103 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3104                                pixman_op_t              op,
3105                                pixman_image_t *         src_image,
3106                                pixman_image_t *         mask_image,
3107                                pixman_image_t *         dst_image,
3108                                int32_t                  src_x,
3109                                int32_t                  src_y,
3110                                int32_t                  mask_x,
3111                                int32_t                  mask_y,
3112                                int32_t                  dest_x,
3113                                int32_t                  dest_y,
3114                                int32_t                  width,
3115                                int32_t                  height)
3116 {
3117     uint16_t    *dst_line, *dst, d;
3118     uint32_t    *src_line, *src, s;
3119     int dst_stride, src_stride;
3120     int32_t w;
3121
3122     __m128i xmm_alpha_lo, xmm_alpha_hi;
3123     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3124     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3125
3126     PIXMAN_IMAGE_GET_LINE (
3127         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3128     PIXMAN_IMAGE_GET_LINE (
3129         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3130
3131     while (height--)
3132     {
3133         dst = dst_line;
3134         src = src_line;
3135
3136         dst_line += dst_stride;
3137         src_line += src_stride;
3138         w = width;
3139
3140         /* Align dst on a 16-byte boundary */
3141         while (w &&
3142                ((unsigned long)dst & 15))
3143         {
3144             s = *src++;
3145             d = *dst;
3146
3147             *dst++ = composite_over_8888_0565pixel (s, d);
3148             w--;
3149         }
3150
3151         /* It's an 8-pixel loop */
3152         while (w >= 8)
3153         {
3154             /* Load the source unaligned: only dst was aligned
3155              * above, so src may not be on a 16-byte boundary.
3156              */
3157             xmm_src = load_128_unaligned ((__m128i*) src);
3158             xmm_dst = load_128_aligned ((__m128i*) dst);
3159
3160             /* Unpacking */
3161             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3162             unpack_565_128_4x128 (xmm_dst,
3163                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3164             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3165                                 &xmm_alpha_lo, &xmm_alpha_hi);
3166
3167             /* Load the next 4 pixels from memory ahead of
3168              * time to optimize the memory read.
3169              */
3170             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3171
3172             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3173                         &xmm_alpha_lo, &xmm_alpha_hi,
3174                         &xmm_dst0, &xmm_dst1);
3175
3176             /* Unpacking */
3177             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3178             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3179                                 &xmm_alpha_lo, &xmm_alpha_hi);
3180
3181             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3182                         &xmm_alpha_lo, &xmm_alpha_hi,
3183                         &xmm_dst2, &xmm_dst3);
3184
3185             save_128_aligned (
3186                 (__m128i*)dst, pack_565_4x128_128 (
3187                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3188
3189             w -= 8;
3190             dst += 8;
3191             src += 8;
3192         }
3193
3194         while (w--)
3195         {
3196             s = *src++;
3197             d = *dst;
3198
3199             *dst++ = composite_over_8888_0565pixel (s, d);
3200         }
3201     }
3202
3203 }
3204
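/* OVER with a solid source and an a8 mask:
 *
 *     dst = src * mask + dst * (1 - src.a * mask)
 *
 * Four mask bytes are fetched at once: 0xffffffff with an opaque
 * source stores the precomputed solid directly, and an all-zero
 * word skips the block.
 */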
3205 static void
3206 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3207                               pixman_op_t              op,
3208                               pixman_image_t *         src_image,
3209                               pixman_image_t *         mask_image,
3210                               pixman_image_t *         dst_image,
3211                               int32_t                  src_x,
3212                               int32_t                  src_y,
3213                               int32_t                  mask_x,
3214                               int32_t                  mask_y,
3215                               int32_t                  dest_x,
3216                               int32_t                  dest_y,
3217                               int32_t                  width,
3218                               int32_t                  height)
3219 {
3220     uint32_t src, srca;
3221     uint32_t *dst_line, *dst;
3222     uint8_t *mask_line, *mask;
3223     int dst_stride, mask_stride;
3224     int32_t w;
3225     uint32_t m, d;
3226
3227     __m128i xmm_src, xmm_alpha, xmm_def;
3228     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3229     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3230
3231     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3232
3233     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3234
3235     srca = src >> 24;
3236     if (src == 0)
3237         return;
3238
3239     PIXMAN_IMAGE_GET_LINE (
3240         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3241     PIXMAN_IMAGE_GET_LINE (
3242         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3243
3244     xmm_def = create_mask_2x32_128 (src, src);
3245     xmm_src = expand_pixel_32_1x128 (src);
3246     xmm_alpha = expand_alpha_1x128 (xmm_src);
3247     mmx_src   = xmm_src;
3248     mmx_alpha = xmm_alpha;
3249
3250     while (height--)
3251     {
3252         dst = dst_line;
3253         dst_line += dst_stride;
3254         mask = mask_line;
3255         mask_line += mask_stride;
3256         w = width;
3257
3258         while (w && (unsigned long)dst & 15)
3259         {
3260             uint8_t m = *mask++;
3261
3262             if (m)
3263             {
3264                 d = *dst;
3265                 mmx_mask = expand_pixel_8_1x128 (m);
3266                 mmx_dest = unpack_32_1x128 (d);
3267
3268                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3269                                                      &mmx_alpha,
3270                                                      &mmx_mask,
3271                                                      &mmx_dest));
3272             }
3273
3274             w--;
3275             dst++;
3276         }
3277
3278         while (w >= 4)
3279         {
3280             m = *((uint32_t*)mask);
3281
3282             if (srca == 0xff && m == 0xffffffff)
3283             {
3284                 save_128_aligned ((__m128i*)dst, xmm_def);
3285             }
3286             else if (m)
3287             {
3288                 xmm_dst = load_128_aligned ((__m128i*) dst);
3289                 xmm_mask = unpack_32_1x128 (m);
3290                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3291
3292                 /* Unpacking */
3293                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3294                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3295
3296                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3297                                         &xmm_mask_lo, &xmm_mask_hi);
3298
3299                 in_over_2x128 (&xmm_src, &xmm_src,
3300                                &xmm_alpha, &xmm_alpha,
3301                                &xmm_mask_lo, &xmm_mask_hi,
3302                                &xmm_dst_lo, &xmm_dst_hi);
3303
3304                 save_128_aligned (
3305                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3306             }
3307
3308             w -= 4;
3309             dst += 4;
3310             mask += 4;
3311         }
3312
3313         while (w)
3314         {
3315             uint8_t m = *mask++;
3316
3317             if (m)
3318             {
3319                 d = *dst;
3320                 mmx_mask = expand_pixel_8_1x128 (m);
3321                 mmx_dest = unpack_32_1x128 (d);
3322
3323                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3324                                                      &mmx_alpha,
3325                                                      &mmx_mask,
3326                                                      &mmx_dest));
3327             }
3328
3329             w--;
3330             dst++;
3331         }
3332     }
3333
3334 }
3335
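/* Solid fill. The 8 and 16 bpp cases first replicate the pixel
 * value into a full 32-bit pattern (for bpp == 8, 0x5a becomes
 * 0x5a5a5a5a via (b << 8) | b and then (w << 16) | w), after which
 * all three depths share one byte-oriented loop: align the pointer
 * in 1/2/4-byte steps, write 128-byte runs of aligned 128-bit
 * stores, then finish with progressively smaller stores.
 */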
3336 pixman_bool_t
3337 pixman_fill_sse2 (uint32_t *bits,
3338                   int       stride,
3339                   int       bpp,
3340                   int       x,
3341                   int       y,
3342                   int       width,
3343                   int       height,
3344                   uint32_t  data)
3345 {
3346     uint32_t byte_width;
3347     uint8_t         *byte_line;
3348
3349     __m128i xmm_def;
3350
3351     if (bpp == 8)
3352     {
3353         uint8_t b;
3354         uint16_t w;
3355
3356         stride = stride * (int) sizeof (uint32_t) / 1;
3357         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3358         byte_width = width;
3359         stride *= 1;
3360
3361         b = data & 0xff;
3362         w = (b << 8) | b;
3363         data = (w << 16) | w;
3364     }
3365     else if (bpp == 16)
3366     {
3367         stride = stride * (int) sizeof (uint32_t) / 2;
3368         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3369         byte_width = 2 * width;
3370         stride *= 2;
3371
3372         data = (data & 0xffff) * 0x00010001;
3373     }
3374     else if (bpp == 32)
3375     {
3376         stride = stride * (int) sizeof (uint32_t) / 4;
3377         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3378         byte_width = 4 * width;
3379         stride *= 4;
3380     }
3381     else
3382     {
3383         return FALSE;
3384     }
3385
3386     xmm_def = create_mask_2x32_128 (data, data);
3387
3388     while (height--)
3389     {
3390         int w;
3391         uint8_t *d = byte_line;
3392         byte_line += stride;
3393         w = byte_width;
3394
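        /* Store 1, 2 and 4 bytes at a time until the destination is
         * 16-byte aligned, then switch to the wide aligned stores below;
         * the tail after the SIMD loops mirrors this in reverse.
         */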
3395         while (w >= 1 && ((unsigned long)d & 1))
3396         {
3397             *(uint8_t *)d = data;
3398             w -= 1;
3399             d += 1;
3400         }
3401
3402         while (w >= 2 && ((unsigned long)d & 3))
3403         {
3404             *(uint16_t *)d = data;
3405             w -= 2;
3406             d += 2;
3407         }
3408
3409         while (w >= 4 && ((unsigned long)d & 15))
3410         {
3411             *(uint32_t *)d = data;
3412
3413             w -= 4;
3414             d += 4;
3415         }
3416
3417         while (w >= 128)
3418         {
3419             save_128_aligned ((__m128i*)(d),     xmm_def);
3420             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3421             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3422             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3423             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3424             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3425             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3426             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3427
3428             d += 128;
3429             w -= 128;
3430         }
3431
3432         if (w >= 64)
3433         {
3434             save_128_aligned ((__m128i*)(d),     xmm_def);
3435             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3436             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3437             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3438
3439             d += 64;
3440             w -= 64;
3441         }
3442
3443         if (w >= 32)
3444         {
3445             save_128_aligned ((__m128i*)(d),     xmm_def);
3446             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3447
3448             d += 32;
3449             w -= 32;
3450         }
3451
3452         if (w >= 16)
3453         {
3454             save_128_aligned ((__m128i*)(d),     xmm_def);
3455
3456             d += 16;
3457             w -= 16;
3458         }
3459
3460         while (w >= 4)
3461         {
3462             *(uint32_t *)d = data;
3463
3464             w -= 4;
3465             d += 4;
3466         }
3467
3468         if (w >= 2)
3469         {
3470             *(uint16_t *)d = data;
3471             w -= 2;
3472             d += 2;
3473         }
3474
3475         if (w >= 1)
3476         {
3477             *(uint8_t *)d = data;
3478             w -= 1;
3479             d += 1;
3480         }
3481     }
3482
3483     return TRUE;
3484 }
3485
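/*
 * SRC, solid source, a8 mask, 8888 destination: dst = src * mask.
 * A zero mask clears the destination pixel; four 0xff mask bytes with
 * an opaque source store the prepared xmm_def constant directly.
 */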
3486 static void
3487 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3488                              pixman_op_t              op,
3489                              pixman_image_t *         src_image,
3490                              pixman_image_t *         mask_image,
3491                              pixman_image_t *         dst_image,
3492                              int32_t                  src_x,
3493                              int32_t                  src_y,
3494                              int32_t                  mask_x,
3495                              int32_t                  mask_y,
3496                              int32_t                  dest_x,
3497                              int32_t                  dest_y,
3498                              int32_t                  width,
3499                              int32_t                  height)
3500 {
3501     uint32_t src, srca;
3502     uint32_t    *dst_line, *dst;
3503     uint8_t     *mask_line, *mask;
3504     int dst_stride, mask_stride;
3505     int32_t w;
3506     uint32_t m;
3507
3508     __m128i xmm_src, xmm_def;
3509     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3510
3511     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3512
3513     srca = src >> 24;
3514     if (src == 0)
3515     {
3516         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3517                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
3518                           dest_x, dest_y, width, height, 0);
3519         return;
3520     }
3521
3522     PIXMAN_IMAGE_GET_LINE (
3523         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3524     PIXMAN_IMAGE_GET_LINE (
3525         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3526
3527     xmm_def = create_mask_2x32_128 (src, src);
3528     xmm_src = expand_pixel_32_1x128 (src);
3529
3530     while (height--)
3531     {
3532         dst = dst_line;
3533         dst_line += dst_stride;
3534         mask = mask_line;
3535         mask_line += mask_stride;
3536         w = width;
3537
3538         while (w && (unsigned long)dst & 15)
3539         {
3540             uint8_t m = *mask++;
3541
3542             if (m)
3543             {
3544                 *dst = pack_1x128_32 (
3545                     pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3546             }
3547             else
3548             {
3549                 *dst = 0;
3550             }
3551
3552             w--;
3553             dst++;
3554         }
3555
3556         while (w >= 4)
3557         {
3558             m = *((uint32_t*)mask);
3559
3560             if (srca == 0xff && m == 0xffffffff)
3561             {
3562                 save_128_aligned ((__m128i*)dst, xmm_def);
3563             }
3564             else if (m)
3565             {
3566                 xmm_mask = unpack_32_1x128 (m);
3567                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3568
3569                 /* Unpacking */
3570                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3571
3572                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3573                                         &xmm_mask_lo, &xmm_mask_hi);
3574
3575                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3576                                     &xmm_mask_lo, &xmm_mask_hi,
3577                                     &xmm_mask_lo, &xmm_mask_hi);
3578
3579                 save_128_aligned (
3580                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3581             }
3582             else
3583             {
3584                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3585             }
3586
3587             w -= 4;
3588             dst += 4;
3589             mask += 4;
3590         }
3591
3592         while (w)
3593         {
3594             uint8_t m = *mask++;
3595
3596             if (m)
3597             {
3598                 *dst = pack_1x128_32 (
3599                     pix_multiply_1x128 (
3600                         xmm_src, expand_pixel_8_1x128 (m)));
3601             }
3602             else
3603             {
3604                 *dst = 0;
3605             }
3606
3607             w--;
3608             dst++;
3609         }
3610     }
3611
3612 }
3613
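/*
 * OVER, solid source, a8 mask, r5g6b5 destination.  Eight 565 pixels
 * are unpacked to 16-bit channels across four registers, combined with
 * in_over, and repacked to 565 in a single aligned store.
 */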
3614 static void
3615 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3616                               pixman_op_t              op,
3617                               pixman_image_t *         src_image,
3618                               pixman_image_t *         mask_image,
3619                               pixman_image_t *         dst_image,
3620                               int32_t                  src_x,
3621                               int32_t                  src_y,
3622                               int32_t                  mask_x,
3623                               int32_t                  mask_y,
3624                               int32_t                  dest_x,
3625                               int32_t                  dest_y,
3626                               int32_t                  width,
3627                               int32_t                  height)
3628 {
3629     uint32_t src, srca;
3630     uint16_t    *dst_line, *dst, d;
3631     uint8_t     *mask_line, *mask;
3632     int dst_stride, mask_stride;
3633     int32_t w;
3634     uint32_t m;
3635     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3636
3637     __m128i xmm_src, xmm_alpha;
3638     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3639     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3640
3641     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3642
3643     srca = src >> 24;
3644     if (src == 0)
3645         return;
3646
3647     PIXMAN_IMAGE_GET_LINE (
3648         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3649     PIXMAN_IMAGE_GET_LINE (
3650         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3651
3652     xmm_src = expand_pixel_32_1x128 (src);
3653     xmm_alpha = expand_alpha_1x128 (xmm_src);
3654     mmx_src = xmm_src;
3655     mmx_alpha = xmm_alpha;
3656
3657     while (height--)
3658     {
3659         dst = dst_line;
3660         dst_line += dst_stride;
3661         mask = mask_line;
3662         mask_line += mask_stride;
3663         w = width;
3664
3665         while (w && (unsigned long)dst & 15)
3666         {
3667             m = *mask++;
3668
3669             if (m)
3670             {
3671                 d = *dst;
3672                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3673                 mmx_dest = expand565_16_1x128 (d);
3674
3675                 *dst = pack_565_32_16 (
3676                     pack_1x128_32 (
3677                         in_over_1x128 (
3678                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3679             }
3680
3681             w--;
3682             dst++;
3683         }
3684
3685         while (w >= 8)
3686         {
3687             xmm_dst = load_128_aligned ((__m128i*) dst);
3688             unpack_565_128_4x128 (xmm_dst,
3689                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3690
3691             m = *((uint32_t*)mask);
3692             mask += 4;
3693
3694             if (m)
3695             {
3696                 xmm_mask = unpack_32_1x128 (m);
3697                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3698
3699                 /* Unpacking */
3700                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3701
3702                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3703                                         &xmm_mask_lo, &xmm_mask_hi);
3704
3705                 in_over_2x128 (&xmm_src, &xmm_src,
3706                                &xmm_alpha, &xmm_alpha,
3707                                &xmm_mask_lo, &xmm_mask_hi,
3708                                &xmm_dst0, &xmm_dst1);
3709             }
3710
3711             m = *((uint32_t*)mask);
3712             mask += 4;
3713
3714             if (m)
3715             {
3716                 xmm_mask = unpack_32_1x128 (m);
3717                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3718
3719                 /* Unpacking */
3720                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3721
3722                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3723                                         &xmm_mask_lo, &xmm_mask_hi);
3724                 in_over_2x128 (&xmm_src, &xmm_src,
3725                                &xmm_alpha, &xmm_alpha,
3726                                &xmm_mask_lo, &xmm_mask_hi,
3727                                &xmm_dst2, &xmm_dst3);
3728             }
3729
3730             save_128_aligned (
3731                 (__m128i*)dst, pack_565_4x128_128 (
3732                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3733
3734             w -= 8;
3735             dst += 8;
3736         }
3737
3738         while (w)
3739         {
3740             m = *mask++;
3741
3742             if (m)
3743             {
3744                 d = *dst;
3745                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3746                 mmx_dest = expand565_16_1x128 (d);
3747
3748                 *dst = pack_565_32_16 (
3749                     pack_1x128_32 (
3750                         in_over_1x128 (
3751                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3752             }
3753
3754             w--;
3755             dst++;
3756         }
3757     }
3758
3759 }
3760
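/*
 * OVER, pixbuf-style source (non-premultiplied, R/B swapped relative to
 * the destination), r5g6b5 destination.  is_opaque / is_zero let whole
 * 4-pixel groups take shortcuts: fully opaque groups only need their
 * channel order fixed up by invert_colors_2x128, fully transparent
 * groups leave the destination untouched, and everything else goes
 * through over_rev_non_pre_2x128.
 */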
3761 static void
3762 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3763                                  pixman_op_t              op,
3764                                  pixman_image_t *         src_image,
3765                                  pixman_image_t *         mask_image,
3766                                  pixman_image_t *         dst_image,
3767                                  int32_t                  src_x,
3768                                  int32_t                  src_y,
3769                                  int32_t                  mask_x,
3770                                  int32_t                  mask_y,
3771                                  int32_t                  dest_x,
3772                                  int32_t                  dest_y,
3773                                  int32_t                  width,
3774                                  int32_t                  height)
3775 {
3776     uint16_t    *dst_line, *dst, d;
3777     uint32_t    *src_line, *src, s;
3778     int dst_stride, src_stride;
3779     int32_t w;
3780     uint32_t opaque, zero;
3781
3782     __m128i ms;
3783     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3784     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3785
3786     PIXMAN_IMAGE_GET_LINE (
3787         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3788     PIXMAN_IMAGE_GET_LINE (
3789         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3790
3791     while (height--)
3792     {
3793         dst = dst_line;
3794         dst_line += dst_stride;
3795         src = src_line;
3796         src_line += src_stride;
3797         w = width;
3798
3799         while (w && (unsigned long)dst & 15)
3800         {
3801             s = *src++;
3802             d = *dst;
3803
3804             ms = unpack_32_1x128 (s);
3805
3806             *dst++ = pack_565_32_16 (
3807                 pack_1x128_32 (
3808                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3809             w--;
3810         }
3811
3812         while (w >= 8)
3813         {
3814             /* First round */
3815             xmm_src = load_128_unaligned ((__m128i*)src);
3816             xmm_dst = load_128_aligned  ((__m128i*)dst);
3817
3818             opaque = is_opaque (xmm_src);
3819             zero = is_zero (xmm_src);
3820
3821             unpack_565_128_4x128 (xmm_dst,
3822                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3823             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3824
3825             /* preload next round */
3826             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3827
3828             if (opaque)
3829             {
3830                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3831                                      &xmm_dst0, &xmm_dst1);
3832             }
3833             else if (!zero)
3834             {
3835                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3836                                         &xmm_dst0, &xmm_dst1);
3837             }
3838
3839             /* Second round */
3840             opaque = is_opaque (xmm_src);
3841             zero = is_zero (xmm_src);
3842
3843             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3844
3845             if (opaque)
3846             {
3847                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3848                                      &xmm_dst2, &xmm_dst3);
3849             }
3850             else if (!zero)
3851             {
3852                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3853                                         &xmm_dst2, &xmm_dst3);
3854             }
3855
3856             save_128_aligned (
3857                 (__m128i*)dst, pack_565_4x128_128 (
3858                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3859
3860             w -= 8;
3861             src += 8;
3862             dst += 8;
3863         }
3864
3865         while (w)
3866         {
3867             s = *src++;
3868             d = *dst;
3869
3870             ms = unpack_32_1x128 (s);
3871
3872             *dst++ = pack_565_32_16 (
3873                 pack_1x128_32 (
3874                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3875             w--;
3876         }
3877     }
3878
3879 }
3880
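/*
 * OVER, pixbuf-style source, 8888 destination: the same opaque / zero
 * shortcuts as the 0565 variant above, four pixels at a time, without
 * the 565 unpack/repack step.
 */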
3881 static void
3882 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3883                                  pixman_op_t              op,
3884                                  pixman_image_t *         src_image,
3885                                  pixman_image_t *         mask_image,
3886                                  pixman_image_t *         dst_image,
3887                                  int32_t                  src_x,
3888                                  int32_t                  src_y,
3889                                  int32_t                  mask_x,
3890                                  int32_t                  mask_y,
3891                                  int32_t                  dest_x,
3892                                  int32_t                  dest_y,
3893                                  int32_t                  width,
3894                                  int32_t                  height)
3895 {
3896     uint32_t    *dst_line, *dst, d;
3897     uint32_t    *src_line, *src, s;
3898     int dst_stride, src_stride;
3899     int32_t w;
3900     uint32_t opaque, zero;
3901
3902     __m128i xmm_src_lo, xmm_src_hi;
3903     __m128i xmm_dst_lo, xmm_dst_hi;
3904
3905     PIXMAN_IMAGE_GET_LINE (
3906         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3907     PIXMAN_IMAGE_GET_LINE (
3908         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3909
3910     while (height--)
3911     {
3912         dst = dst_line;
3913         dst_line += dst_stride;
3914         src = src_line;
3915         src_line += src_stride;
3916         w = width;
3917
3918         while (w && (unsigned long)dst & 15)
3919         {
3920             s = *src++;
3921             d = *dst;
3922
3923             *dst++ = pack_1x128_32 (
3924                 over_rev_non_pre_1x128 (
3925                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
3926
3927             w--;
3928         }
3929
3930         while (w >= 4)
3931         {
3932             xmm_src_hi = load_128_unaligned ((__m128i*)src);
3933
3934             opaque = is_opaque (xmm_src_hi);
3935             zero = is_zero (xmm_src_hi);
3936
3937             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3938
3939             if (opaque)
3940             {
3941                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3942                                      &xmm_dst_lo, &xmm_dst_hi);
3943
3944                 save_128_aligned (
3945                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3946             }
3947             else if (!zero)
3948             {
3949                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
3950
3951                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3952
3953                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3954                                         &xmm_dst_lo, &xmm_dst_hi);
3955
3956                 save_128_aligned (
3957                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3958             }
3959
3960             w -= 4;
3961             dst += 4;
3962             src += 4;
3963         }
3964
3965         while (w)
3966         {
3967             s = *src++;
3968             d = *dst;
3969
3970             *dst++ = pack_1x128_32 (
3971                 over_rev_non_pre_1x128 (
3972                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
3973
3974             w--;
3975         }
3976     }
3977
3978 }
3979
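/*
 * OVER, solid source, component-alpha 8888 mask, r5g6b5 destination.
 * pack_cmp is the byte mask from comparing four mask pixels against
 * zero: a value of 0xffff means all four mask pixels are zero, so the
 * in_over step for that group of pixels can be skipped.
 */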
3980 static void
3981 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3982                                     pixman_op_t              op,
3983                                     pixman_image_t *         src_image,
3984                                     pixman_image_t *         mask_image,
3985                                     pixman_image_t *         dst_image,
3986                                     int32_t                  src_x,
3987                                     int32_t                  src_y,
3988                                     int32_t                  mask_x,
3989                                     int32_t                  mask_y,
3990                                     int32_t                  dest_x,
3991                                     int32_t                  dest_y,
3992                                     int32_t                  width,
3993                                     int32_t                  height)
3994 {
3995     uint32_t src;
3996     uint16_t    *dst_line, *dst, d;
3997     uint32_t    *mask_line, *mask, m;
3998     int dst_stride, mask_stride;
3999     int w;
4000     uint32_t pack_cmp;
4001
4002     __m128i xmm_src, xmm_alpha;
4003     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4004     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4005
4006     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4007
4008     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4009
4010     if (src == 0)
4011         return;
4012
4013     PIXMAN_IMAGE_GET_LINE (
4014         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4015     PIXMAN_IMAGE_GET_LINE (
4016         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4017
4018     xmm_src = expand_pixel_32_1x128 (src);
4019     xmm_alpha = expand_alpha_1x128 (xmm_src);
4020     mmx_src = xmm_src;
4021     mmx_alpha = xmm_alpha;
4022
4023     while (height--)
4024     {
4025         w = width;
4026         mask = mask_line;
4027         dst = dst_line;
4028         mask_line += mask_stride;
4029         dst_line += dst_stride;
4030
4031         while (w && ((unsigned long)dst & 15))
4032         {
4033             m = *(uint32_t *) mask;
4034
4035             if (m)
4036             {
4037                 d = *dst;
4038                 mmx_mask = unpack_32_1x128 (m);
4039                 mmx_dest = expand565_16_1x128 (d);
4040
4041                 *dst = pack_565_32_16 (
4042                     pack_1x128_32 (
4043                         in_over_1x128 (
4044                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4045             }
4046
4047             w--;
4048             dst++;
4049             mask++;
4050         }
4051
4052         while (w >= 8)
4053         {
4054             /* First round */
4055             xmm_mask = load_128_unaligned ((__m128i*)mask);
4056             xmm_dst = load_128_aligned ((__m128i*)dst);
4057
4058             pack_cmp = _mm_movemask_epi8 (
4059                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4060
4061             unpack_565_128_4x128 (xmm_dst,
4062                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4063             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4064
4065             /* preload next round */
4066             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4067
4069             if (pack_cmp != 0xffff)
4070             {
4071                 in_over_2x128 (&xmm_src, &xmm_src,
4072                                &xmm_alpha, &xmm_alpha,
4073                                &xmm_mask_lo, &xmm_mask_hi,
4074                                &xmm_dst0, &xmm_dst1);
4075             }
4076
4077             /* Second round */
4078             pack_cmp = _mm_movemask_epi8 (
4079                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4080
4081             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4082
4083             if (pack_cmp != 0xffff)
4084             {
4085                 in_over_2x128 (&xmm_src, &xmm_src,
4086                                &xmm_alpha, &xmm_alpha,
4087                                &xmm_mask_lo, &xmm_mask_hi,
4088                                &xmm_dst2, &xmm_dst3);
4089             }
4090
4091             save_128_aligned (
4092                 (__m128i*)dst, pack_565_4x128_128 (
4093                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4094
4095             w -= 8;
4096             dst += 8;
4097             mask += 8;
4098         }
4099
4100         while (w)
4101         {
4102             m = *(uint32_t *) mask;
4103
4104             if (m)
4105             {
4106                 d = *dst;
4107                 mmx_mask = unpack_32_1x128 (m);
4108                 mmx_dest = expand565_16_1x128 (d);
4109
4110                 *dst = pack_565_32_16 (
4111                     pack_1x128_32 (
4112                         in_over_1x128 (
4113                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4114             }
4115
4116             w--;
4117             dst++;
4118             mask++;
4119         }
4120     }
4121
4122 }
4123
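/*
 * IN, solid source, a8 mask, a8 destination: dst = srca * mask * dst,
 * processed 16 pixels per iteration of the main loop.
 */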
4124 static void
4125 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4126                          pixman_op_t              op,
4127                          pixman_image_t *         src_image,
4128                          pixman_image_t *         mask_image,
4129                          pixman_image_t *         dst_image,
4130                          int32_t                  src_x,
4131                          int32_t                  src_y,
4132                          int32_t                  mask_x,
4133                          int32_t                  mask_y,
4134                          int32_t                  dest_x,
4135                          int32_t                  dest_y,
4136                          int32_t                  width,
4137                          int32_t                  height)
4138 {
4139     uint8_t     *dst_line, *dst;
4140     uint8_t     *mask_line, *mask;
4141     int dst_stride, mask_stride;
4142     uint32_t d, m;
4143     uint32_t src;
4144     uint8_t sa;
4145     int32_t w;
4146
4147     __m128i xmm_alpha;
4148     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4149     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4150
4151     PIXMAN_IMAGE_GET_LINE (
4152         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4153     PIXMAN_IMAGE_GET_LINE (
4154         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4155
4156     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4157
4158     sa = src >> 24;
4159
4160     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4161
4162     while (height--)
4163     {
4164         dst = dst_line;
4165         dst_line += dst_stride;
4166         mask = mask_line;
4167         mask_line += mask_stride;
4168         w = width;
4169
4170         while (w && ((unsigned long)dst & 15))
4171         {
4172             m = (uint32_t) *mask++;
4173             d = (uint32_t) *dst;
4174
4175             *dst++ = (uint8_t) pack_1x128_32 (
4176                 pix_multiply_1x128 (
4177                     pix_multiply_1x128 (xmm_alpha,
4178                                        unpack_32_1x128 (m)),
4179                     unpack_32_1x128 (d)));
4180             w--;
4181         }
4182
4183         while (w >= 16)
4184         {
4185             xmm_mask = load_128_unaligned ((__m128i*)mask);
4186             xmm_dst = load_128_aligned ((__m128i*)dst);
4187
4188             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4189             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4190
4191             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4192                                 &xmm_mask_lo, &xmm_mask_hi,
4193                                 &xmm_mask_lo, &xmm_mask_hi);
4194
4195             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4196                                 &xmm_dst_lo, &xmm_dst_hi,
4197                                 &xmm_dst_lo, &xmm_dst_hi);
4198
4199             save_128_aligned (
4200                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4201
4202             mask += 16;
4203             dst += 16;
4204             w -= 16;
4205         }
4206
4207         while (w)
4208         {
4209             m = (uint32_t) *mask++;
4210             d = (uint32_t) *dst;
4211
4212             *dst++ = (uint8_t) pack_1x128_32 (
4213                 pix_multiply_1x128 (
4214                     pix_multiply_1x128 (
4215                         xmm_alpha, unpack_32_1x128 (m)),
4216                     unpack_32_1x128 (d)));
4217             w--;
4218         }
4219     }
4220
4221 }
4222
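/*
 * IN, solid source, a8 destination: dst = srca * dst.  srca == 0xff
 * leaves the destination unchanged, and srca == 0x00 degenerates to a
 * fill with zero.
 */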
4223 static void
4224 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4225                        pixman_op_t              op,
4226                        pixman_image_t *         src_image,
4227                        pixman_image_t *         mask_image,
4228                        pixman_image_t *         dst_image,
4229                        int32_t                  src_x,
4230                        int32_t                  src_y,
4231                        int32_t                  mask_x,
4232                        int32_t                  mask_y,
4233                        int32_t                  dest_x,
4234                        int32_t                  dest_y,
4235                        int32_t                  width,
4236                        int32_t                  height)
4237 {
4238     uint8_t     *dst_line, *dst;
4239     int dst_stride;
4240     uint32_t d;
4241     uint32_t src;
4242     int32_t w;
4243
4244     __m128i xmm_alpha;
4245     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4246
4247     PIXMAN_IMAGE_GET_LINE (
4248         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4249
4250     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4251
4252     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4253
4254     src = src >> 24;
4255
4256     if (src == 0xff)
4257         return;
4258
4259     if (src == 0x00)
4260     {
4261         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4262                      8, dest_x, dest_y, width, height, src);
4263
4264         return;
4265     }
4266
4267     while (height--)
4268     {
4269         dst = dst_line;
4270         dst_line += dst_stride;
4271         w = width;
4272
4273         while (w && ((unsigned long)dst & 15))
4274         {
4275             d = (uint32_t) *dst;
4276
4277             *dst++ = (uint8_t) pack_1x128_32 (
4278                 pix_multiply_1x128 (
4279                     xmm_alpha,
4280                     unpack_32_1x128 (d)));
4281             w--;
4282         }
4283
4284         while (w >= 16)
4285         {
4286             xmm_dst = load_128_aligned ((__m128i*)dst);
4287
4288             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4289
4290             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4291                                 &xmm_dst_lo, &xmm_dst_hi,
4292                                 &xmm_dst_lo, &xmm_dst_hi);
4293
4294             save_128_aligned (
4295                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4296
4297             dst += 16;
4298             w -= 16;
4299         }
4300
4301         while (w)
4302         {
4303             d = (uint32_t) *dst;
4304
4305             *dst++ = (uint8_t) pack_1x128_32 (
4306                 pix_multiply_1x128 (
4307                     xmm_alpha,
4308                     unpack_32_1x128 (d)));
4309             w--;
4310         }
4311     }
4312
4313 }
4314
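/* IN, a8 source, a8 destination: dst = src * dst, 16 pixels at a time. */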
4315 static void
4316 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4317                        pixman_op_t              op,
4318                        pixman_image_t *         src_image,
4319                        pixman_image_t *         mask_image,
4320                        pixman_image_t *         dst_image,
4321                        int32_t                  src_x,
4322                        int32_t                  src_y,
4323                        int32_t                  mask_x,
4324                        int32_t                  mask_y,
4325                        int32_t                  dest_x,
4326                        int32_t                  dest_y,
4327                        int32_t                  width,
4328                        int32_t                  height)
4329 {
4330     uint8_t     *dst_line, *dst;
4331     uint8_t     *src_line, *src;
4332     int src_stride, dst_stride;
4333     int32_t w;
4334     uint32_t s, d;
4335
4336     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4337     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4338
4339     PIXMAN_IMAGE_GET_LINE (
4340         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4341     PIXMAN_IMAGE_GET_LINE (
4342         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4343
4344     while (height--)
4345     {
4346         dst = dst_line;
4347         dst_line += dst_stride;
4348         src = src_line;
4349         src_line += src_stride;
4350         w = width;
4351
4352         while (w && ((unsigned long)dst & 15))
4353         {
4354             s = (uint32_t) *src++;
4355             d = (uint32_t) *dst;
4356
4357             *dst++ = (uint8_t) pack_1x128_32 (
4358                 pix_multiply_1x128 (
4359                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4360             w--;
4361         }
4362
4363         while (w >= 16)
4364         {
4365             xmm_src = load_128_unaligned ((__m128i*)src);
4366             xmm_dst = load_128_aligned ((__m128i*)dst);
4367
4368             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4369             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4370
4371             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4372                                 &xmm_dst_lo, &xmm_dst_hi,
4373                                 &xmm_dst_lo, &xmm_dst_hi);
4374
4375             save_128_aligned (
4376                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4377
4378             src += 16;
4379             dst += 16;
4380             w -= 16;
4381         }
4382
4383         while (w)
4384         {
4385             s = (uint32_t) *src++;
4386             d = (uint32_t) *dst;
4387
4388             *dst++ = (uint8_t) pack_1x128_32 (
4389                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4390             w--;
4391         }
4392     }
4393
4394 }
4395
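/*
 * ADD, solid source, a8 mask, a8 destination:
 * dst = clamp (srca * mask + dst), using the saturating _mm_adds_epu16
 * on the unpacked 16-bit channels.
 */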
4396 static void
4397 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4398                           pixman_op_t              op,
4399                           pixman_image_t *         src_image,
4400                           pixman_image_t *         mask_image,
4401                           pixman_image_t *         dst_image,
4402                           int32_t                  src_x,
4403                           int32_t                  src_y,
4404                           int32_t                  mask_x,
4405                           int32_t                  mask_y,
4406                           int32_t                  dest_x,
4407                           int32_t                  dest_y,
4408                           int32_t                  width,
4409                           int32_t                  height)
4410 {
4411     uint8_t     *dst_line, *dst;
4412     uint8_t     *mask_line, *mask;
4413     int dst_stride, mask_stride;
4414     int32_t w;
4415     uint32_t src;
4416     uint8_t sa;
4417     uint32_t m, d;
4418
4419     __m128i xmm_alpha;
4420     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4421     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4422
4423     PIXMAN_IMAGE_GET_LINE (
4424         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4425     PIXMAN_IMAGE_GET_LINE (
4426         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4427
4428     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4429
4430     sa = src >> 24;
4431
4432     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4433
4434     while (height--)
4435     {
4436         dst = dst_line;
4437         dst_line += dst_stride;
4438         mask = mask_line;
4439         mask_line += mask_stride;
4440         w = width;
4441
4442         while (w && ((unsigned long)dst & 15))
4443         {
4444             m = (uint32_t) *mask++;
4445             d = (uint32_t) *dst;
4446
4447             *dst++ = (uint8_t) pack_1x128_32 (
4448                 _mm_adds_epu16 (
4449                     pix_multiply_1x128 (
4450                         xmm_alpha, unpack_32_1x128 (m)),
4451                     unpack_32_1x128 (d)));
4452             w--;
4453         }
4454
4455         while (w >= 16)
4456         {
4457             xmm_mask = load_128_unaligned ((__m128i*)mask);
4458             xmm_dst = load_128_aligned ((__m128i*)dst);
4459
4460             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4461             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4462
4463             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4464                                 &xmm_mask_lo, &xmm_mask_hi,
4465                                 &xmm_mask_lo, &xmm_mask_hi);
4466
4467             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4468             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4469
4470             save_128_aligned (
4471                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4472
4473             mask += 16;
4474             dst += 16;
4475             w -= 16;
4476         }
4477
4478         while (w)
4479         {
4480             m = (uint32_t) *mask++;
4481             d = (uint32_t) *dst;
4482
4483             *dst++ = (uint8_t) pack_1x128_32 (
4484                 _mm_adds_epu16 (
4485                     pix_multiply_1x128 (
4486                         xmm_alpha, unpack_32_1x128 (m)),
4487                     unpack_32_1x128 (d)));
4488
4489             w--;
4490         }
4491     }
4492
4493 }
4494
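/*
 * ADD, solid source, a8 destination: dst = clamp (srca + dst).  The
 * alpha byte is replicated across the whole register so that a single
 * _mm_adds_epu8 adds 16 destination pixels at once.
 */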
4495 static void
4496 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4497                         pixman_op_t              op,
4498                         pixman_image_t *         src_image,
4499                         pixman_image_t *         mask_image,
4500                         pixman_image_t *         dst_image,
4501                         int32_t                  src_x,
4502                         int32_t                  src_y,
4503                         int32_t                  mask_x,
4504                         int32_t                  mask_y,
4505                         int32_t                  dest_x,
4506                         int32_t                  dest_y,
4507                         int32_t                  width,
4508                         int32_t                  height)
4509 {
4510     uint8_t     *dst_line, *dst;
4511     int dst_stride;
4512     int32_t w;
4513     uint32_t src;
4514
4515     __m128i xmm_src;
4516
4517     PIXMAN_IMAGE_GET_LINE (
4518         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4519
4520     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4521
4522     src >>= 24;
4523
4524     if (src == 0x00)
4525         return;
4526
4527     if (src == 0xff)
4528     {
4529         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4530                      8, dest_x, dest_y, width, height, 0xff);
4531
4532         return;
4533     }
4534
4535     src = (src << 24) | (src << 16) | (src << 8) | src;
4536     xmm_src = _mm_set_epi32 (src, src, src, src);
4537
4538     while (height--)
4539     {
4540         dst = dst_line;
4541         dst_line += dst_stride;
4542         w = width;
4543
4544         while (w && ((unsigned long)dst & 15))
4545         {
4546             *dst = (uint8_t)_mm_cvtsi128_si32 (
4547                 _mm_adds_epu8 (
4548                     xmm_src,
4549                     _mm_cvtsi32_si128 (*dst)));
4550
4551             w--;
4552             dst++;
4553         }
4554
4555         while (w >= 16)
4556         {
4557             save_128_aligned (
4558                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4559
4560             dst += 16;
4561             w -= 16;
4562         }
4563
4564         while (w)
4565         {
4566             *dst = (uint8_t)_mm_cvtsi128_si32 (
4567                 _mm_adds_epu8 (
4568                     xmm_src,
4569                     _mm_cvtsi32_si128 (*dst)));
4570
4571             w--;
4572             dst++;
4573         }
4574     }
4575
4576 }
4577
4578 static void
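/*
 * ADD, a8 source, a8 destination.  The unaligned head and tail use the
 * branch-free scalar saturating add: for t up to 510, (t >> 8) is 1
 * exactly when the 8-bit sum overflowed, so (0 - (t >> 8)) is all ones
 * and the OR clamps the result to 0xff.  The aligned middle is handed
 * to sse2_combine_add_u, four bytes per combined unit.
 */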
4579 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4580                         pixman_op_t              op,
4581                         pixman_image_t *         src_image,
4582                         pixman_image_t *         mask_image,
4583                         pixman_image_t *         dst_image,
4584                         int32_t                  src_x,
4585                         int32_t                  src_y,
4586                         int32_t                  mask_x,
4587                         int32_t                  mask_y,
4588                         int32_t                  dest_x,
4589                         int32_t                  dest_y,
4590                         int32_t                  width,
4591                         int32_t                  height)
4592 {
4593     uint8_t     *dst_line, *dst;
4594     uint8_t     *src_line, *src;
4595     int dst_stride, src_stride;
4596     int32_t w;
4597     uint16_t t;
4598
4599     PIXMAN_IMAGE_GET_LINE (
4600         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4601     PIXMAN_IMAGE_GET_LINE (
4602         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4603
4604     while (height--)
4605     {
4606         dst = dst_line;
4607         src = src_line;
4608
4609         dst_line += dst_stride;
4610         src_line += src_stride;
4611         w = width;
4612
4613         /* Small head */
4614         while (w && (unsigned long)dst & 3)
4615         {
4616             t = (*dst) + (*src++);
4617             *dst++ = t | (0 - (t >> 8));
4618             w--;
4619         }
4620
4621         sse2_combine_add_u (imp, op,
4622                             (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4623
4624         /* Small tail */
4625         dst += w & ~3;
4626         src += w & ~3;
4627
4628         w &= 3;
4629
4630         while (w)
4631         {
4632             t = (*dst) + (*src++);
4633             *dst++ = t | (0 - (t >> 8));
4634             w--;
4635         }
4636     }
4637
4638 }
4639
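/*
 * ADD, 8888 source and destination: each scanline goes straight through
 * sse2_combine_add_u.
 */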
4640 static void
4641 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4642                               pixman_op_t              op,
4643                               pixman_image_t *         src_image,
4644                               pixman_image_t *         mask_image,
4645                               pixman_image_t *         dst_image,
4646                               int32_t                  src_x,
4647                               int32_t                  src_y,
4648                               int32_t                  mask_x,
4649                               int32_t                  mask_y,
4650                               int32_t                  dest_x,
4651                               int32_t                  dest_y,
4652                               int32_t                  width,
4653                               int32_t                  height)
4654 {
4655     uint32_t    *dst_line, *dst;
4656     uint32_t    *src_line, *src;
4657     int dst_stride, src_stride;
4658
4659     PIXMAN_IMAGE_GET_LINE (
4660         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4661     PIXMAN_IMAGE_GET_LINE (
4662         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4663
4664     while (height--)
4665     {
4666         dst = dst_line;
4667         dst_line += dst_stride;
4668         src = src_line;
4669         src_line += src_stride;
4670
4671         sse2_combine_add_u (imp, op, dst, src, NULL, width);
4672     }
4673
4674 }
4675
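/*
 * pixman_blt_sse2 --
 *
 * Copy between two buffers of equal depth; only 16 and 32 bpp are
 * handled, anything else returns FALSE so the caller can fall back.
 * Loads are unaligned, stores are aligned after a short scalar head,
 * and the main loop moves 64 bytes per iteration.
 *
 * Illustrative call (hypothetical values), copying a 64x64 block
 * between two 32 bpp buffers:
 *
 *     pixman_blt_sse2 (src_bits, dst_bits, src_stride, dst_stride,
 *                      32, 32, 0, 0, 16, 16, 64, 64);
 */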
4676 static pixman_bool_t
4677 pixman_blt_sse2 (uint32_t *src_bits,
4678                  uint32_t *dst_bits,
4679                  int       src_stride,
4680                  int       dst_stride,
4681                  int       src_bpp,
4682                  int       dst_bpp,
4683                  int       src_x,
4684                  int       src_y,
4685                  int       dst_x,
4686                  int       dst_y,
4687                  int       width,
4688                  int       height)
4689 {
4690     uint8_t *   src_bytes;
4691     uint8_t *   dst_bytes;
4692     int byte_width;
4693
4694     if (src_bpp != dst_bpp)
4695         return FALSE;
4696
4697     if (src_bpp == 16)
4698     {
4699         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4700         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4701         src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4702         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4703         byte_width = 2 * width;
4704         src_stride *= 2;
4705         dst_stride *= 2;
4706     }
4707     else if (src_bpp == 32)
4708     {
4709         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4710         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4711         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4712         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4713         byte_width = 4 * width;
4714         src_stride *= 4;
4715         dst_stride *= 4;
4716     }
4717     else
4718     {
4719         return FALSE;
4720     }
4721
4722     while (height--)
4723     {
4724         int w;
4725         uint8_t *s = src_bytes;
4726         uint8_t *d = dst_bytes;
4727         src_bytes += src_stride;
4728         dst_bytes += dst_stride;
4729         w = byte_width;
4730
4731         while (w >= 2 && ((unsigned long)d & 3))
4732         {
4733             *(uint16_t *)d = *(uint16_t *)s;
4734             w -= 2;
4735             s += 2;
4736             d += 2;
4737         }
4738
4739         while (w >= 4 && ((unsigned long)d & 15))
4740         {
4741             *(uint32_t *)d = *(uint32_t *)s;
4742
4743             w -= 4;
4744             s += 4;
4745             d += 4;
4746         }
4747
4748         while (w >= 64)
4749         {
4750             __m128i xmm0, xmm1, xmm2, xmm3;
4751
4752             xmm0 = load_128_unaligned ((__m128i*)(s));
4753             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4754             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4755             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4756
4757             save_128_aligned ((__m128i*)(d),    xmm0);
4758             save_128_aligned ((__m128i*)(d + 16), xmm1);
4759             save_128_aligned ((__m128i*)(d + 32), xmm2);
4760             save_128_aligned ((__m128i*)(d + 48), xmm3);
4761
4762             s += 64;
4763             d += 64;
4764             w -= 64;
4765         }
4766
4767         while (w >= 16)
4768         {
4769             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4770
4771             w -= 16;
4772             d += 16;
4773             s += 16;
4774         }
4775
4776         while (w >= 4)
4777         {
4778             *(uint32_t *)d = *(uint32_t *)s;
4779
4780             w -= 4;
4781             s += 4;
4782             d += 4;
4783         }
4784
4785         if (w >= 2)
4786         {
4787             *(uint16_t *)d = *(uint16_t *)s;
4788             w -= 2;
4789             s += 2;
4790             d += 2;
4791         }
4792     }
4793
4795     return TRUE;
4796 }
4797
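/* SRC copy: a thin wrapper feeding image geometry to pixman_blt_sse2. */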
4798 static void
4799 sse2_composite_copy_area (pixman_implementation_t *imp,
4800                           pixman_op_t              op,
4801                           pixman_image_t *         src_image,
4802                           pixman_image_t *         mask_image,
4803                           pixman_image_t *         dst_image,
4804                           int32_t                  src_x,
4805                           int32_t                  src_y,
4806                           int32_t                  mask_x,
4807                           int32_t                  mask_y,
4808                           int32_t                  dest_x,
4809                           int32_t                  dest_y,
4810                           int32_t                  width,
4811                           int32_t                  height)
4812 {
4813     pixman_blt_sse2 (src_image->bits.bits,
4814                      dst_image->bits.bits,
4815                      src_image->bits.rowstride,
4816                      dst_image->bits.rowstride,
4817                      PIXMAN_FORMAT_BPP (src_image->bits.format),
4818                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
4819                      src_x, src_y, dest_x, dest_y, width, height);
4820 }
4821
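/*
 * OVER, x8r8g8b8 source, a8 mask, 8888 destination.  The source has no
 * alpha channel, so 0xff000000 is OR-ed in up front and the constant
 * mask_00ff stands in as the (fully opaque) source alpha for in_over.
 */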
4822 static void
4823 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4824                                  pixman_op_t              op,
4825                                  pixman_image_t *         src_image,
4826                                  pixman_image_t *         mask_image,
4827                                  pixman_image_t *         dst_image,
4828                                  int32_t                  src_x,
4829                                  int32_t                  src_y,
4830                                  int32_t                  mask_x,
4831                                  int32_t                  mask_y,
4832                                  int32_t                  dest_x,
4833                                  int32_t                  dest_y,
4834                                  int32_t                  width,
4835                                  int32_t                  height)
4836 {
4837     uint32_t    *src, *src_line, s;
4838     uint32_t    *dst, *dst_line, d;
4839     uint8_t         *mask, *mask_line;
4840     uint32_t m;
4841     int src_stride, mask_stride, dst_stride;
4842     int32_t w;
4843     __m128i ms;
4844
4845     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4846     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4847     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4848
4849     PIXMAN_IMAGE_GET_LINE (
4850         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4851     PIXMAN_IMAGE_GET_LINE (
4852         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4853     PIXMAN_IMAGE_GET_LINE (
4854         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4855
4856     while (height--)
4857     {
4858         src = src_line;
4859         src_line += src_stride;
4860         dst = dst_line;
4861         dst_line += dst_stride;
4862         mask = mask_line;
4863         mask_line += mask_stride;
4864
4865         w = width;
4866
4867         while (w && (unsigned long)dst & 15)
4868         {
4869             s = 0xff000000 | *src++;
4870             m = (uint32_t) *mask++;
4871             d = *dst;
4872             ms = unpack_32_1x128 (s);
4873
4874             if (m != 0xff)
4875             {
4876                 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4877                 __m128i md = unpack_32_1x128 (d);
4878
4879                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4880             }
4881
4882             *dst++ = pack_1x128_32 (ms);
4883             w--;
4884         }
4885
4886         while (w >= 4)
4887         {
4888             m = *(uint32_t*) mask;
4889             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
4890
4891             if (m == 0xffffffff)
4892             {
4893                 save_128_aligned ((__m128i*)dst, xmm_src);
4894             }
4895             else
4896             {
4897                 xmm_dst = load_128_aligned ((__m128i*)dst);
4898
4899                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4900
4901                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4902                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4903                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4904
4905                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4906
4907                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
4908
4909                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4910             }
4911
4912             src += 4;
4913             dst += 4;
4914             mask += 4;
4915             w -= 4;
4916         }
4917
4918         while (w)
4919         {
4920             m = (uint32_t) *mask++;
4921
4922             if (m)
4923             {
4924                 s = 0xff000000 | *src;
4925
4926                 if (m == 0xff)
4927                 {
4928                     *dst = s;
4929                 }
4930                 else
4931                 {
4932                     __m128i ma, md, ms;
4933
4934                     d = *dst;
4935
4936                     ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4937                     md = unpack_32_1x128 (d);
4938                     ms = unpack_32_1x128 (s);
4939
4940                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4941                 }
4942
4943             }
4944
4945             src++;
4946             dst++;
4947             w--;
4948         }
4949     }
4950
4951 }
4952
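/*
 * OVER, a8r8g8b8 source, a8 mask, 8888 destination: in_over with the
 * real per-pixel source alpha.  A group of four pixels is stored
 * directly when the mask is 0xffffffff and the source group is opaque.
 */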
4953 static void
4954 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4955                                  pixman_op_t              op,
4956                                  pixman_image_t *         src_image,
4957                                  pixman_image_t *         mask_image,
4958                                  pixman_image_t *         dst_image,
4959                                  int32_t                  src_x,
4960                                  int32_t                  src_y,
4961                                  int32_t                  mask_x,
4962                                  int32_t                  mask_y,
4963                                  int32_t                  dest_x,
4964                                  int32_t                  dest_y,
4965                                  int32_t                  width,
4966                                  int32_t                  height)
4967 {
4968     uint32_t    *src, *src_line, s;
4969     uint32_t    *dst, *dst_line, d;
4970     uint8_t         *mask, *mask_line;
4971     uint32_t m;
4972     int src_stride, mask_stride, dst_stride;
4973     int32_t w;
4974
4975     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4976     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4977     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4978
4979     PIXMAN_IMAGE_GET_LINE (
4980         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4981     PIXMAN_IMAGE_GET_LINE (
4982         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4983     PIXMAN_IMAGE_GET_LINE (
4984         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4985
4986     while (height--)
4987     {
4988         src = src_line;
4989         src_line += src_stride;
4990         dst = dst_line;
4991         dst_line += dst_stride;
4992         mask = mask_line;
4993         mask_line += mask_stride;
4994
4995         w = width;
4996
4997         while (w && (unsigned long)dst & 15)
4998         {
4999             uint32_t sa;
5000
5001             s = *src++;
5002             m = (uint32_t) *mask++;
5003             d = *dst;
5004
5005             sa = s >> 24;
5006
5007             if (m)
5008             {
5009                 if (sa == 0xff && m == 0xff)
5010                 {
5011                     *dst = s;
5012                 }
5013                 else
5014                 {
5015                     __m128i ms, md, ma, msa;
5016
5017                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5018                     ms = unpack_32_1x128 (s);
5019                     md = unpack_32_1x128 (d);
5020
5021                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5022
5023                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5024                 }
5025             }
5026
5027             dst++;
5028             w--;
5029         }
5030
5031         while (w >= 4)
5032         {
5033             m = *(uint32_t *) mask;
5034
5035             if (m)
5036             {
5037                 xmm_src = load_128_unaligned ((__m128i*)src);
5038
5039                 if (m == 0xffffffff && is_opaque (xmm_src))
5040                 {
5041                     save_128_aligned ((__m128i *)dst, xmm_src);
5042                 }
5043                 else
5044                 {
5045                     xmm_dst = load_128_aligned ((__m128i *)dst);
5046
5047                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5048
5049                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5050                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5051                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5052
5053                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5054                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5055
5056                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5057                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5058
5059                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5060                 }
5061             }
5062
5063             src += 4;
5064             dst += 4;
5065             mask += 4;
5066             w -= 4;
5067         }
5068
5069         while (w)
5070         {
5071             uint32_t sa;
5072
5073             s = *src++;
5074             m = (uint32_t) *mask++;
5075             d = *dst;
5076
5077             sa = s >> 24;
5078
5079             if (m)
5080             {
5081                 if (sa == 0xff && m == 0xff)
5082                 {
5083                     *dst = s;
5084                 }
5085                 else
5086                 {
5087                     __m128i ms, md, ma, msa;
5088
5089                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5090                     ms = unpack_32_1x128 (s);
5091                     md = unpack_32_1x128 (d);
5092
5093                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5094
5095                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5096                 }
5097             }
5098
5099             dst++;
5100             w--;
5101         }
5102     }
5103
5104 }
5105
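/*
 * OVER_REVERSE with a solid source: the existing destination is
 * composited over the solid color, dst = dst + (1 - dst.alpha) * src.
 * The color is expanded into xmm_src once, outside the loops.
 */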
5106 static void
5107 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5108                                     pixman_op_t              op,
5109                                     pixman_image_t *         src_image,
5110                                     pixman_image_t *         mask_image,
5111                                     pixman_image_t *         dst_image,
5112                                     int32_t                  src_x,
5113                                     int32_t                  src_y,
5114                                     int32_t                  mask_x,
5115                                     int32_t                  mask_y,
5116                                     int32_t                  dest_x,
5117                                     int32_t                  dest_y,
5118                                     int32_t                  width,
5119                                     int32_t                  height)
5120 {
5121     uint32_t src;
5122     uint32_t    *dst_line, *dst;
5123     __m128i xmm_src;
5124     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5125     __m128i xmm_dsta_hi, xmm_dsta_lo;
5126     int dst_stride;
5127     int32_t w;
5128
5129     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5130
5131     if (src == 0)
5132         return;
5133
5134     PIXMAN_IMAGE_GET_LINE (
5135         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5136
5137     xmm_src = expand_pixel_32_1x128 (src);
5138
5139     while (height--)
5140     {
5141         dst = dst_line;
5142
5143         dst_line += dst_stride;
5144         w = width;
5145
5146         while (w && (unsigned long)dst & 15)
5147         {
5148             __m128i vd;
5149
5150             vd = unpack_32_1x128 (*dst);
5151
5152             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5153                                               xmm_src));
5154             w--;
5155             dst++;
5156         }
5157
5158         while (w >= 4)
5159         {
5160             __m128i tmp_lo, tmp_hi;
5161
5162             xmm_dst = load_128_aligned ((__m128i*)dst);
5163
5164             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5165             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5166
5167             tmp_lo = xmm_src;
5168             tmp_hi = xmm_src;
5169
5170             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5171                         &xmm_dsta_lo, &xmm_dsta_hi,
5172                         &tmp_lo, &tmp_hi);
5173
5174             save_128_aligned (
5175                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5176
5177             w -= 4;
5178             dst += 4;
5179         }
5180
5181         while (w)
5182         {
5183             __m128i vd;
5184
5185             vd = unpack_32_1x128 (*dst);
5186
5187             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5188                                               xmm_src));
5189             w--;
5190             dst++;
5191         }
5192
5193     }
5194
5195 }
5196
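/*
 * OVER with an a8r8g8b8 mask image used as a unified (not
 * component-alpha) mask: the scalar loops keep only its alpha byte
 * via (*mask++) >> 24, and the vector loop replicates the alpha
 * across all channels with expand_alpha_2x128.
 */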
5197 static void
5198 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5199                                     pixman_op_t              op,
5200                                     pixman_image_t *         src_image,
5201                                     pixman_image_t *         mask_image,
5202                                     pixman_image_t *         dst_image,
5203                                     int32_t                  src_x,
5204                                     int32_t                  src_y,
5205                                     int32_t                  mask_x,
5206                                     int32_t                  mask_y,
5207                                     int32_t                  dest_x,
5208                                     int32_t                  dest_y,
5209                                     int32_t                  width,
5210                                     int32_t                  height)
5211 {
5212     uint32_t    *src, *src_line, s;
5213     uint32_t    *dst, *dst_line, d;
5214     uint32_t    *mask, *mask_line;
5215     uint32_t    m;
5216     int src_stride, mask_stride, dst_stride;
5217     int32_t w;
5218
5219     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5220     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5221     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5222
5223     PIXMAN_IMAGE_GET_LINE (
5224         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5225     PIXMAN_IMAGE_GET_LINE (
5226         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5227     PIXMAN_IMAGE_GET_LINE (
5228         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5229
5230     while (height--)
5231     {
5232         src = src_line;
5233         src_line += src_stride;
5234         dst = dst_line;
5235         dst_line += dst_stride;
5236         mask = mask_line;
5237         mask_line += mask_stride;
5238
5239         w = width;
5240
5241         while (w && (unsigned long)dst & 15)
5242         {
5243             uint32_t sa;
5244
5245             s = *src++;
5246             m = (*mask++) >> 24;
5247             d = *dst;
5248
5249             sa = s >> 24;
5250
5251             if (m)
5252             {
5253                 if (sa == 0xff && m == 0xff)
5254                 {
5255                     *dst = s;
5256                 }
5257                 else
5258                 {
5259                     __m128i ms, md, ma, msa;
5260
5261                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5262                     ms = unpack_32_1x128 (s);
5263                     md = unpack_32_1x128 (d);
5264
5265                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5266
5267                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5268                 }
5269             }
5270
5271             dst++;
5272             w--;
5273         }
5274
5275         while (w >= 4)
5276         {
5277             xmm_mask = load_128_unaligned ((__m128i*)mask);
5278
5279             if (!is_transparent (xmm_mask))
5280             {
5281                 xmm_src = load_128_unaligned ((__m128i*)src);
5282
5283                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5284                 {
5285                     save_128_aligned ((__m128i *)dst, xmm_src);
5286                 }
5287                 else
5288                 {
5289                     xmm_dst = load_128_aligned ((__m128i *)dst);
5290
5291                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5292                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5293                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5294
5295                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5296                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5297
5298                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5299                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5300
5301                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5302                 }
5303             }
5304
5305             src += 4;
5306             dst += 4;
5307             mask += 4;
5308             w -= 4;
5309         }
5310
5311         while (w)
5312         {
5313             uint32_t sa;
5314
5315             s = *src++;
5316             m = (*mask++) >> 24;
5317             d = *dst;
5318
5319             sa = s >> 24;
5320
5321             if (m)
5322             {
5323                 if (sa == 0xff && m == 0xff)
5324                 {
5325                     *dst = s;
5326                 }
5327                 else
5328                 {
5329                     __m128i ms, md, ma, msa;
5330
5331                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5332                     ms = unpack_32_1x128 (s);
5333                     md = unpack_32_1x128 (d);
5334
5335                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5336
5337                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5338                 }
5339             }
5340
5341             dst++;
5342             w--;
5343         }
5344     }
5345
5346 }
5347
5348 /* A variant of 'sse2_combine_over_u' with minor tweaks */
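/*
 * vx is 16.16 fixed point: vx >> 16 selects the nearest source pixel
 * and unit_x is added per destination pixel.  pm stays NULL here, so
 * combine1()/combine4() reduce to plain loads.
 */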
5349 static force_inline void
5350 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5351                                              const uint32_t* ps,
5352                                              int32_t         w,
5353                                              pixman_fixed_t  vx,
5354                                              pixman_fixed_t  unit_x,
5355                                              pixman_fixed_t  max_vx,
5356                                              pixman_bool_t   fully_transparent_src)
5357 {
5358     uint32_t s, d;
5359     const uint32_t* pm = NULL;
5360
5361     __m128i xmm_dst_lo, xmm_dst_hi;
5362     __m128i xmm_src_lo, xmm_src_hi;
5363     __m128i xmm_alpha_lo, xmm_alpha_hi;
5364
5365     if (fully_transparent_src)
5366         return;
5367
5368     /* Align dst on a 16-byte boundary */
5369     while (w && ((unsigned long)pd & 15))
5370     {
5371         d = *pd;
5372         s = combine1 (ps + (vx >> 16), pm);
5373         vx += unit_x;
5374
5375         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5376         if (pm)
5377             pm++;
5378         w--;
5379     }
5380
5381     while (w >= 4)
5382     {
5383         __m128i tmp;
5384         uint32_t tmp1, tmp2, tmp3, tmp4;
5385
5386         tmp1 = ps[vx >> 16];
5387         vx += unit_x;
5388         tmp2 = ps[vx >> 16];
5389         vx += unit_x;
5390         tmp3 = ps[vx >> 16];
5391         vx += unit_x;
5392         tmp4 = ps[vx >> 16];
5393         vx += unit_x;
5394
5395         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5396
5397         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5398
5399         if (is_opaque (xmm_src_hi))
5400         {
5401             save_128_aligned ((__m128i*)pd, xmm_src_hi);
5402         }
5403         else if (!is_zero (xmm_src_hi))
5404         {
5405             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5406
5407             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5408             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5409
5410             expand_alpha_2x128 (
5411                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5412
5413             over_2x128 (&xmm_src_lo, &xmm_src_hi,
5414                         &xmm_alpha_lo, &xmm_alpha_hi,
5415                         &xmm_dst_lo, &xmm_dst_hi);
5416
5417             /* rebuild the 4 pixel data and save */
5418             save_128_aligned ((__m128i*)pd,
5419                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5420         }
5421
5422         w -= 4;
5423         pd += 4;
5424         if (pm)
5425             pm += 4;
5426     }
5427
5428     while (w)
5429     {
5430         d = *pd;
5431         s = combine1 (ps + (vx >> 16), pm);
5432         vx += unit_x;
5433
5434         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5435         if (pm)
5436             pm++;
5437
5438         w--;
5439     }
5440 }
5441
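/*
 * FAST_NEAREST_MAINLOOP (pixman-fast-path.h) wraps the scanline
 * function above into complete fast-path entries, one per repeat
 * mode (COVER, NONE, PAD).
 */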
5442 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5443                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5444                        uint32_t, uint32_t, COVER)
5445 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5446                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5447                        uint32_t, uint32_t, NONE)
5448 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5449                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5450                        uint32_t, uint32_t, PAD)
5451
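/*
 * Nearest-neighbour OVER with a solid mask: only the alpha byte of
 * *mask is used.  It is replicated into xmm_mask once up front, and a
 * fully transparent mask returns early.
 */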
5452 static force_inline void
5453 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5454                                                uint32_t *       dst,
5455                                                const uint32_t * src,
5456                                                int32_t          w,
5457                                                pixman_fixed_t   vx,
5458                                                pixman_fixed_t   unit_x,
5459                                                pixman_fixed_t   max_vx,
5460                                                pixman_bool_t    zero_src)
5461 {
5462     __m128i xmm_mask;
5463     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5464     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5465     __m128i xmm_alpha_lo, xmm_alpha_hi;
5466
5467     if (zero_src || (*mask >> 24) == 0)
5468         return;
5469
5470     xmm_mask = create_mask_16_128 (*mask >> 24);
5471
5472     while (w && (unsigned long)dst & 15)
5473     {
5474         uint32_t s = src[pixman_fixed_to_int (vx)];
5475         vx += unit_x;
5476
5477         if (s)
5478         {
5479             uint32_t d = *dst;
5480
5481             __m128i ms = unpack_32_1x128 (s);
5482             __m128i alpha = expand_alpha_1x128 (ms);
5483             __m128i mask  = xmm_mask;
5484             __m128i dest  = unpack_32_1x128 (d);
5485
5486             *dst = pack_1x128_32 (
5487                 in_over_1x128 (&ms, &alpha, &mask, &dest));
5488         }
5489         dst++;
5490         w--;
5491     }
5492
5493     while (w >= 4)
5494     {
5495         uint32_t tmp1, tmp2, tmp3, tmp4;
5496
5497         tmp1 = src[pixman_fixed_to_int (vx)];
5498         vx += unit_x;
5499         tmp2 = src[pixman_fixed_to_int (vx)];
5500         vx += unit_x;
5501         tmp3 = src[pixman_fixed_to_int (vx)];
5502         vx += unit_x;
5503         tmp4 = src[pixman_fixed_to_int (vx)];
5504         vx += unit_x;
5505
5506         xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5507
5508         if (!is_zero (xmm_src))
5509         {
5510             xmm_dst = load_128_aligned ((__m128i*)dst);
5511
5512             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5513             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5514             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5515                                 &xmm_alpha_lo, &xmm_alpha_hi);
5516
5517             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5518                            &xmm_alpha_lo, &xmm_alpha_hi,
5519                            &xmm_mask, &xmm_mask,
5520                            &xmm_dst_lo, &xmm_dst_hi);
5521
5522             save_128_aligned (
5523                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5524         }
5525
5526         dst += 4;
5527         w -= 4;
5528     }
5529
5530     while (w)
5531     {
5532         uint32_t s = src[pixman_fixed_to_int (vx)];
5533         vx += unit_x;
5534
5535         if (s)
5536         {
5537             uint32_t d = *dst;
5538
5539             __m128i ms = unpack_32_1x128 (s);
5540             __m128i alpha = expand_alpha_1x128 (ms);
5541             __m128i mask  = xmm_mask;
5542             __m128i dest  = unpack_32_1x128 (d);
5543
5544             *dst = pack_1x128_32 (
5545                 in_over_1x128 (&ms, &alpha, &mask, &dest));
5546         }
5547
5548         dst++;
5549         w--;
5550     }
5551
5552 }
5553
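/*
 * Per the template in pixman-fast-path.h, the two trailing TRUE
 * arguments mark these variants as having a mask and that mask being
 * solid (one value per composite call).
 */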
5554 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5555                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5556                               uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5557 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5558                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5559                               uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5560 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5561                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5562                               uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5563
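/*
 * Table of operations this implementation accelerates.  The generic
 * code matches operator and source/mask/destination formats against
 * these entries; the first match wins.
 */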
5564 static const pixman_fast_path_t sse2_fast_paths[] =
5565 {
5566     /* PIXMAN_OP_OVER */
5567     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5568     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5569     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5570     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5571     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5572     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5573     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5574     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5575     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5576     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5577     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5578     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5579     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5580     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5581     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5582     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5583     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5584     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5585     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5586     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5587     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5588     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5589     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5590     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5591     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5592     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5593     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5594     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5595     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5596     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5597     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5598     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5599     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5600     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5601     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5602     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5603     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5604     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5605     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5606     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5607     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5608     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5609     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5610     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5611     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5612     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5613
5614     /* PIXMAN_OP_OVER_REVERSE */
5615     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5616     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5617
5618     /* PIXMAN_OP_ADD */
5619     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5620     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5621     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5622     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5623     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5624     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5625
5626     /* PIXMAN_OP_SRC */
5627     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5628     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5629     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5630     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5631     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5632     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5633     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5634     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5635     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5636     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5637     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5638     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5639     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5640     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5641
5642     /* PIXMAN_OP_IN */
5643     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5644     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5645     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5646
5647     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5648     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5649     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5650     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5651     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5652     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5653     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5654     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5655     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5656     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5657     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5658     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5659
5660     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
5661     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
5662     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
5663     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
5664
5665     { PIXMAN_OP_NONE },
5666 };
5667
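/*
 * Copy a rectangle of bits.  If pixman_blt_sse2 cannot handle the
 * bpp combination it returns FALSE and the request is passed on to
 * the delegate implementation.
 */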
5668 static pixman_bool_t
5669 sse2_blt (pixman_implementation_t *imp,
5670           uint32_t *               src_bits,
5671           uint32_t *               dst_bits,
5672           int                      src_stride,
5673           int                      dst_stride,
5674           int                      src_bpp,
5675           int                      dst_bpp,
5676           int                      src_x,
5677           int                      src_y,
5678           int                      dst_x,
5679           int                      dst_y,
5680           int                      width,
5681           int                      height)
5682 {
5683     if (!pixman_blt_sse2 (
5684             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5685             src_x, src_y, dst_x, dst_y, width, height))
5686
5687     {
5688         return _pixman_implementation_blt (
5689             imp->delegate,
5690             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5691             src_x, src_y, dst_x, dst_y, width, height);
5692     }
5693
5694     return TRUE;
5695 }
5696
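/*
 * On 32-bit x86 the incoming stack is only guaranteed 4-byte
 * alignment, so GCC is asked to realign the frame: the code below
 * spills 16-byte SSE registers.
 */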
5697 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5698 __attribute__((__force_align_arg_pointer__))
5699 #endif
5700 static pixman_bool_t
5701 sse2_fill (pixman_implementation_t *imp,
5702            uint32_t *               bits,
5703            int                      stride,
5704            int                      bpp,
5705            int                      x,
5706            int                      y,
5707            int                      width,
5708            int                      height,
5709            uint32_t xor)
5710 {
5711     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5712     {
5713         return _pixman_implementation_fill (
5714             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5715     }
5716
5717     return TRUE;
5718 }
5719
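/*
 * Source scanline fetchers: each converts one scanline of the image
 * into a8r8g8b8 in iter->buffer.  For x8r8g8b8 this only means
 * forcing the alpha byte to 0xff.
 */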
5720 static uint32_t *
5721 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
5722 {
5723     int w = iter->width;
5724     __m128i ff000000 = mask_ff000000;
5725     uint32_t *dst = iter->buffer;
5726     uint32_t *src = (uint32_t *)iter->bits;
5727
5728     iter->bits += iter->stride;
5729
5730     while (w && ((unsigned long)dst) & 0x0f)
5731     {
5732         *dst++ = (*src++) | 0xff000000;
5733         w--;
5734     }
5735
5736     while (w >= 4)
5737     {
5738         save_128_aligned (
5739             (__m128i *)dst, _mm_or_si128 (
5740                 load_128_unaligned ((__m128i *)src), ff000000));
5741
5742         dst += 4;
5743         src += 4;
5744         w -= 4;
5745     }
5746
5747     while (w)
5748     {
5749         *dst++ = (*src++) | 0xff000000;
5750         w--;
5751     }
5752
5753     return iter->buffer;
5754 }
5755
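/*
 * r5g6b5 -> a8r8g8b8: eight pixels per iteration, widened with the
 * unpack_565_to_8888 helper and then given an opaque alpha byte.
 */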
5756 static uint32_t *
5757 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
5758 {
5759     int w = iter->width;
5760     uint32_t *dst = iter->buffer;
5761     uint16_t *src = (uint16_t *)iter->bits;
5762     __m128i ff000000 = mask_ff000000;
5763
5764     iter->bits += iter->stride;
5765
5766     while (w && ((unsigned long)dst) & 0x0f)
5767     {
5768         uint16_t s = *src++;
5769
5770         *dst++ = CONVERT_0565_TO_8888 (s);
5771         w--;
5772     }
5773
5774     while (w >= 8)
5775     {
5776         __m128i lo, hi, s;
5777
5778         s = _mm_loadu_si128 ((__m128i *)src);
5779
5780         lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
5781         hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
5782
5783         save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
5784         save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
5785
5786         dst += 8;
5787         src += 8;
5788         w -= 8;
5789     }
5790
5791     while (w)
5792     {
5793         uint16_t s = *src++;
5794
5795         *dst++ = CONVERT_0565_TO_8888 (s);
5796         w--;
5797     }
5798
5799     return iter->buffer;
5800 }
5801
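/*
 * a8 -> a8r8g8b8: two rounds of interleaving with zero (bytes, then
 * words) move each alpha byte into the top byte of a 32-bit pixel,
 * i.e. a << 24, sixteen pixels per iteration.
 */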
5802 static uint32_t *
5803 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
5804 {
5805     int w = iter->width;
5806     uint32_t *dst = iter->buffer;
5807     uint8_t *src = iter->bits;
5808     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5809
5810     iter->bits += iter->stride;
5811
5812     while (w && (((unsigned long)dst) & 15))
5813     {
5814         *dst++ = *(src++) << 24;
5815         w--;
5816     }
5817
5818     while (w >= 16)
5819     {
5820         xmm0 = _mm_loadu_si128((__m128i *)src);
5821
5822         xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
5823         xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
5824         xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
5825         xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
5826         xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
5827         xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
5828
5829         _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
5830         _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
5831         _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
5832         _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
5833
5834         dst += 16;
5835         src += 16;
5836         w -= 16;
5837     }
5838
5839     while (w)
5840     {
5841         *dst++ = *(src++) << 24;
5842         w--;
5843     }
5844
5845     return iter->buffer;
5846 }
5847
5848 typedef struct
5849 {
5850     pixman_format_code_t        format;
5851     pixman_iter_get_scanline_t  get_scanline;
5852 } fetcher_info_t;
5853
5854 static const fetcher_info_t fetchers[] =
5855 {
5856     { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
5857     { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
5858     { PIXMAN_a8,                sse2_fetch_a8 },
5859     { PIXMAN_null }
5860 };
5861
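/*
 * Install one of the fetchers above when the image is a simple,
 * untransformed bits image that fully contains the requested
 * rectangle; anything else is deferred to the delegate's iterator.
 */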
5862 static void
5863 sse2_src_iter_init (pixman_implementation_t *imp,
5864                     pixman_iter_t *iter,
5865                     pixman_image_t *image,
5866                     int x, int y, int width, int height,
5867                     uint8_t *buffer, iter_flags_t flags)
5868 {
5869 #define FLAGS                                                           \
5870     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
5871
5872     if ((flags & ITER_NARROW)                           &&
5873         (image->common.flags & FLAGS) == FLAGS          &&
5874         x >= 0 && y >= 0                                &&
5875         x + width <= image->bits.width                  &&
5876         y + height <= image->bits.height)
5877     {
5878         const fetcher_info_t *f;
5879
5880         for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
5881         {
5882             if (image->common.extended_format_code == f->format)
5883             {
5884                 uint8_t *b = (uint8_t *)image->bits.bits;
5885                 int s = image->bits.rowstride * 4;
5886
5887                 iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
5888                 iter->stride = s;
5889                 iter->width = width;
5890                 iter->buffer = (uint32_t *)buffer;
5891
5892                 iter->get_scanline = f->get_scanline;
5893                 return;
5894             }
5895         }
5896     }
5897
5898     _pixman_implementation_src_iter_init (
5899         imp->delegate, iter, image, x, y, width, height, buffer, flags);
5900 }
5901
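/*
 * The __m128i constants are filled in at run time rather than with
 * static initializers, then the combiner, blt, fill and iterator
 * hooks are pointed at the SSE2 routines.  Operations not covered
 * here fall through to the 'fallback' implementation.
 */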
5902 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5903 __attribute__((__force_align_arg_pointer__))
5904 #endif
5905 pixman_implementation_t *
5906 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
5907 {
5908     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
5909
5910     /* SSE2 constants */
5911     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5912     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5913     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5914     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5915     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5916     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5917     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5918     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5919     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
5920     mask_0080 = create_mask_16_128 (0x0080);
5921     mask_00ff = create_mask_16_128 (0x00ff);
5922     mask_0101 = create_mask_16_128 (0x0101);
5923     mask_ffff = create_mask_16_128 (0xffff);
5924     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5925     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5926
5927     /* Set up function pointers */
5928     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5929     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5930     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5931     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5932     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5933     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5934     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5935     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5936     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5937     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5938
5939     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5940
5941     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5942     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5943     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5944     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5945     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5946     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5947     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5948     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5949     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5950     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5951     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5952
5953     imp->blt = sse2_blt;
5954     imp->fill = sse2_fill;
5955
5956     imp->src_iter_init = sse2_src_iter_init;
5957
5958     return imp;
5959 }