Fill out parts of iters in _pixman_implementation_{src,dest}_iter_init()
[profile/ivi/pixman.git] / pixman / pixman-sse2.c
1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
34 #include <emmintrin.h> /* for SSE2 intrinsics */
35 #include "pixman-private.h"
36 #include "pixman-combine32.h"
37 #include "pixman-fast-path.h"
38
/*
 * File-scope SSE2 constants shared by the helpers below.  No initializer is
 * visible at this point in the file, so they are presumably filled in by an
 * init routine elsewhere; the notes describe only how the helpers in this
 * part of the file use them.
 */

/* Added to and multiplied with the product in the pix_multiply helpers. */
static __m128i mask_0080;
/* XOR-ed with unpacked data by the negate helpers. */
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
/* OR-ed into the expanded alpha by the over_rev_non_pre helpers. */
static __m128i mask_alpha;

/* Channel selectors used when packing 8888 pixels to 565. */
static __m128i mask_565_r;
static __m128i mask_565_g1;
static __m128i mask_565_g2;
static __m128i mask_565_b;
/* Channel selectors used when unpacking 565 pixels to 8888. */
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

/* Bits that unpack_565_to_8888 shifts down and OR-s back into each channel. */
static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;
55
56 static force_inline __m128i
57 unpack_32_1x128 (uint32_t data)
58 {
59     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
60 }
61
62 static force_inline void
63 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
64 {
65     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
66     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
67 }
68
69 static force_inline __m128i
70 unpack_565_to_8888 (__m128i lo)
71 {
72     __m128i r, g, b, rb, t;
73
74     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
75     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
76     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
77
78     rb = _mm_or_si128 (r, b);
79     t  = _mm_and_si128 (rb, mask_565_fix_rb);
80     t  = _mm_srli_epi32 (t, 5);
81     rb = _mm_or_si128 (rb, t);
82
83     t  = _mm_and_si128 (g, mask_565_fix_g);
84     t  = _mm_srli_epi32 (t, 6);
85     g  = _mm_or_si128 (g, t);
86
87     return _mm_or_si128 (rb, g);
88 }
89
90 static force_inline void
91 unpack_565_128_4x128 (__m128i  data,
92                       __m128i* data0,
93                       __m128i* data1,
94                       __m128i* data2,
95                       __m128i* data3)
96 {
97     __m128i lo, hi;
98
99     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
100     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
101
102     lo = unpack_565_to_8888 (lo);
103     hi = unpack_565_to_8888 (hi);
104
105     unpack_128_2x128 (lo, data0, data1);
106     unpack_128_2x128 (hi, data2, data3);
107 }
108
109 static force_inline uint16_t
110 pack_565_32_16 (uint32_t pixel)
111 {
112     return (uint16_t) (((pixel >> 8) & 0xf800) |
113                        ((pixel >> 5) & 0x07e0) |
114                        ((pixel >> 3) & 0x001f));
115 }
116
117 static force_inline __m128i
118 pack_2x128_128 (__m128i lo, __m128i hi)
119 {
120     return _mm_packus_epi16 (lo, hi);
121 }
122
123 static force_inline __m128i
124 pack_565_2x128_128 (__m128i lo, __m128i hi)
125 {
126     __m128i data;
127     __m128i r, g1, g2, b;
128
129     data = pack_2x128_128 (lo, hi);
130
131     r  = _mm_and_si128 (data, mask_565_r);
132     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
133     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
134     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
135
136     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
137 }
138
139 static force_inline __m128i
140 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
141 {
142     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
143                              pack_565_2x128_128 (*xmm2, *xmm3));
144 }
145
146 static force_inline int
147 is_opaque (__m128i x)
148 {
149     __m128i ffs = _mm_cmpeq_epi8 (x, x);
150
151     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
152 }
153
154 static force_inline int
155 is_zero (__m128i x)
156 {
157     return _mm_movemask_epi8 (
158         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
159 }
160
161 static force_inline int
162 is_transparent (__m128i x)
163 {
164     return (_mm_movemask_epi8 (
165                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
166 }
167
168 static force_inline __m128i
169 expand_pixel_32_1x128 (uint32_t data)
170 {
171     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
172 }
173
174 static force_inline __m128i
175 expand_alpha_1x128 (__m128i data)
176 {
177     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
178                                                      _MM_SHUFFLE (3, 3, 3, 3)),
179                                 _MM_SHUFFLE (3, 3, 3, 3));
180 }
181
182 static force_inline void
183 expand_alpha_2x128 (__m128i  data_lo,
184                     __m128i  data_hi,
185                     __m128i* alpha_lo,
186                     __m128i* alpha_hi)
187 {
188     __m128i lo, hi;
189
190     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
191     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
192
193     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
194     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
195 }
196
197 static force_inline void
198 expand_alpha_rev_2x128 (__m128i  data_lo,
199                         __m128i  data_hi,
200                         __m128i* alpha_lo,
201                         __m128i* alpha_hi)
202 {
203     __m128i lo, hi;
204
205     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
206     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
207     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
208     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
209 }
210
211 static force_inline void
212 pix_multiply_2x128 (__m128i* data_lo,
213                     __m128i* data_hi,
214                     __m128i* alpha_lo,
215                     __m128i* alpha_hi,
216                     __m128i* ret_lo,
217                     __m128i* ret_hi)
218 {
219     __m128i lo, hi;
220
221     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
222     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
223     lo = _mm_adds_epu16 (lo, mask_0080);
224     hi = _mm_adds_epu16 (hi, mask_0080);
225     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
226     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
227 }
228
229 static force_inline void
230 pix_add_multiply_2x128 (__m128i* src_lo,
231                         __m128i* src_hi,
232                         __m128i* alpha_dst_lo,
233                         __m128i* alpha_dst_hi,
234                         __m128i* dst_lo,
235                         __m128i* dst_hi,
236                         __m128i* alpha_src_lo,
237                         __m128i* alpha_src_hi,
238                         __m128i* ret_lo,
239                         __m128i* ret_hi)
240 {
241     __m128i t1_lo, t1_hi;
242     __m128i t2_lo, t2_hi;
243
244     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
245     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
246
247     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
248     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
249 }
250
251 static force_inline void
252 negate_2x128 (__m128i  data_lo,
253               __m128i  data_hi,
254               __m128i* neg_lo,
255               __m128i* neg_hi)
256 {
257     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
258     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
259 }
260
261 static force_inline void
262 invert_colors_2x128 (__m128i  data_lo,
263                      __m128i  data_hi,
264                      __m128i* inv_lo,
265                      __m128i* inv_hi)
266 {
267     __m128i lo, hi;
268
269     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
270     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
271     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
272     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
273 }
274
275 static force_inline void
276 over_2x128 (__m128i* src_lo,
277             __m128i* src_hi,
278             __m128i* alpha_lo,
279             __m128i* alpha_hi,
280             __m128i* dst_lo,
281             __m128i* dst_hi)
282 {
283     __m128i t1, t2;
284
285     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
286
287     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
288
289     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
290     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
291 }
292
293 static force_inline void
294 over_rev_non_pre_2x128 (__m128i  src_lo,
295                         __m128i  src_hi,
296                         __m128i* dst_lo,
297                         __m128i* dst_hi)
298 {
299     __m128i lo, hi;
300     __m128i alpha_lo, alpha_hi;
301
302     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
303
304     lo = _mm_or_si128 (alpha_lo, mask_alpha);
305     hi = _mm_or_si128 (alpha_hi, mask_alpha);
306
307     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
308
309     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
310
311     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
312 }
313
314 static force_inline void
315 in_over_2x128 (__m128i* src_lo,
316                __m128i* src_hi,
317                __m128i* alpha_lo,
318                __m128i* alpha_hi,
319                __m128i* mask_lo,
320                __m128i* mask_hi,
321                __m128i* dst_lo,
322                __m128i* dst_hi)
323 {
324     __m128i s_lo, s_hi;
325     __m128i a_lo, a_hi;
326
327     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
328     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
329
330     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
331 }
332
333 /* load 4 pixels from a 16-byte boundary aligned address */
334 static force_inline __m128i
335 load_128_aligned (__m128i* src)
336 {
337     return _mm_load_si128 (src);
338 }
339
340 /* load 4 pixels from a unaligned address */
341 static force_inline __m128i
342 load_128_unaligned (const __m128i* src)
343 {
344     return _mm_loadu_si128 (src);
345 }
346
347 /* save 4 pixels using Write Combining memory on a 16-byte
348  * boundary aligned address
349  */
350 static force_inline void
351 save_128_write_combining (__m128i* dst,
352                           __m128i  data)
353 {
354     _mm_stream_si128 (dst, data);
355 }
356
357 /* save 4 pixels on a 16-byte boundary aligned address */
358 static force_inline void
359 save_128_aligned (__m128i* dst,
360                   __m128i  data)
361 {
362     _mm_store_si128 (dst, data);
363 }
364
365 /* save 4 pixels on a unaligned address */
366 static force_inline void
367 save_128_unaligned (__m128i* dst,
368                     __m128i  data)
369 {
370     _mm_storeu_si128 (dst, data);
371 }
372
373 static force_inline __m128i
374 load_32_1x128 (uint32_t data)
375 {
376     return _mm_cvtsi32_si128 (data);
377 }
378
379 static force_inline __m128i
380 expand_alpha_rev_1x128 (__m128i data)
381 {
382     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
383 }
384
385 static force_inline __m128i
386 expand_pixel_8_1x128 (uint8_t data)
387 {
388     return _mm_shufflelo_epi16 (
389         unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
390 }
391
392 static force_inline __m128i
393 pix_multiply_1x128 (__m128i data,
394                     __m128i alpha)
395 {
396     return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
397                                             mask_0080),
398                             mask_0101);
399 }
400
401 static force_inline __m128i
402 pix_add_multiply_1x128 (__m128i* src,
403                         __m128i* alpha_dst,
404                         __m128i* dst,
405                         __m128i* alpha_src)
406 {
407     __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
408     __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
409
410     return _mm_adds_epu8 (t1, t2);
411 }
412
413 static force_inline __m128i
414 negate_1x128 (__m128i data)
415 {
416     return _mm_xor_si128 (data, mask_00ff);
417 }
418
419 static force_inline __m128i
420 invert_colors_1x128 (__m128i data)
421 {
422     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
423 }
424
425 static force_inline __m128i
426 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
427 {
428     return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
429 }
430
431 static force_inline __m128i
432 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
433 {
434     return over_1x128 (pix_multiply_1x128 (*src, *mask),
435                        pix_multiply_1x128 (*alpha, *mask),
436                        *dst);
437 }
438
439 static force_inline __m128i
440 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
441 {
442     __m128i alpha = expand_alpha_1x128 (src);
443
444     return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
445                                            _mm_or_si128 (alpha, mask_alpha)),
446                        alpha,
447                        dst);
448 }
449
450 static force_inline uint32_t
451 pack_1x128_32 (__m128i data)
452 {
453     return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
454 }
455
456 static force_inline __m128i
457 expand565_16_1x128 (uint16_t pixel)
458 {
459     __m128i m = _mm_cvtsi32_si128 (pixel);
460
461     m = unpack_565_to_8888 (m);
462
463     return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
464 }
465
466 static force_inline uint32_t
467 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
468 {
469     uint8_t a;
470     __m128i xmms;
471
472     a = src >> 24;
473
474     if (a == 0xff)
475     {
476         return src;
477     }
478     else if (src)
479     {
480         xmms = unpack_32_1x128 (src);
481         return pack_1x128_32 (
482             over_1x128 (xmms, expand_alpha_1x128 (xmms),
483                         unpack_32_1x128 (dst)));
484     }
485
486     return dst;
487 }
488
489 static force_inline uint32_t
490 combine1 (const uint32_t *ps, const uint32_t *pm)
491 {
492     uint32_t s = *ps;
493
494     if (pm)
495     {
496         __m128i ms, mm;
497
498         mm = unpack_32_1x128 (*pm);
499         mm = expand_alpha_1x128 (mm);
500
501         ms = unpack_32_1x128 (s);
502         ms = pix_multiply_1x128 (ms, mm);
503
504         s = pack_1x128_32 (ms);
505     }
506
507     return s;
508 }
509
510 static force_inline __m128i
511 combine4 (const __m128i *ps, const __m128i *pm)
512 {
513     __m128i xmm_src_lo, xmm_src_hi;
514     __m128i xmm_msk_lo, xmm_msk_hi;
515     __m128i s;
516
517     if (pm)
518     {
519         xmm_msk_lo = load_128_unaligned (pm);
520
521         if (is_transparent (xmm_msk_lo))
522             return _mm_setzero_si128 ();
523     }
524
525     s = load_128_unaligned (ps);
526
527     if (pm)
528     {
529         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
530         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
531
532         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
533
534         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
535                             &xmm_msk_lo, &xmm_msk_hi,
536                             &xmm_src_lo, &xmm_src_hi);
537
538         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
539     }
540
541     return s;
542 }
543
544 static force_inline void
545 core_combine_over_u_sse2_mask (uint32_t *         pd,
546                                const uint32_t*    ps,
547                                const uint32_t*    pm,
548                                int                w)
549 {
550     uint32_t s, d;
551
552     /* Align dst on a 16-byte boundary */
553     while (w && ((unsigned long)pd & 15))
554     {
555         d = *pd;
556         s = combine1 (ps, pm);
557
558         if (s)
559             *pd = core_combine_over_u_pixel_sse2 (s, d);
560         pd++;
561         ps++;
562         pm++;
563         w--;
564     }
565
566     while (w >= 4)
567     {
568         __m128i mask = load_128_unaligned ((__m128i *)pm);
569
570         if (!is_zero (mask))
571         {
572             __m128i src;
573             __m128i src_hi, src_lo;
574             __m128i mask_hi, mask_lo;
575             __m128i alpha_hi, alpha_lo;
576
577             src = load_128_unaligned ((__m128i *)ps);
578
579             if (is_opaque (_mm_and_si128 (src, mask)))
580             {
581                 save_128_aligned ((__m128i *)pd, src);
582             }
583             else
584             {
585                 __m128i dst = load_128_aligned ((__m128i *)pd);
586                 __m128i dst_hi, dst_lo;
587
588                 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
589                 unpack_128_2x128 (src, &src_lo, &src_hi);
590
591                 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
592                 pix_multiply_2x128 (&src_lo, &src_hi,
593                                     &mask_lo, &mask_hi,
594                                     &src_lo, &src_hi);
595
596                 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
597
598                 expand_alpha_2x128 (src_lo, src_hi,
599                                     &alpha_lo, &alpha_hi);
600
601                 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
602                             &dst_lo, &dst_hi);
603
604                 save_128_aligned (
605                     (__m128i *)pd,
606                     pack_2x128_128 (dst_lo, dst_hi));
607             }
608         }
609
610         pm += 4;
611         ps += 4;
612         pd += 4;
613         w -= 4;
614     }
615     while (w)
616     {
617         d = *pd;
618         s = combine1 (ps, pm);
619
620         if (s)
621             *pd = core_combine_over_u_pixel_sse2 (s, d);
622         pd++;
623         ps++;
624         pm++;
625
626         w--;
627     }
628 }
629
630 static force_inline void
631 core_combine_over_u_sse2_no_mask (uint32_t *      pd,
632                                   const uint32_t*    ps,
633                                   int                w)
634 {
635     uint32_t s, d;
636
637     /* Align dst on a 16-byte boundary */
638     while (w && ((unsigned long)pd & 15))
639     {
640         d = *pd;
641         s = *ps;
642
643         if (s)
644             *pd = core_combine_over_u_pixel_sse2 (s, d);
645         pd++;
646         ps++;
647         w--;
648     }
649
650     while (w >= 4)
651     {
652         __m128i src;
653         __m128i src_hi, src_lo, dst_hi, dst_lo;
654         __m128i alpha_hi, alpha_lo;
655
656         src = load_128_unaligned ((__m128i *)ps);
657
658         if (!is_zero (src))
659         {
660             if (is_opaque (src))
661             {
662                 save_128_aligned ((__m128i *)pd, src);
663             }
664             else
665             {
666                 __m128i dst = load_128_aligned ((__m128i *)pd);
667
668                 unpack_128_2x128 (src, &src_lo, &src_hi);
669                 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
670
671                 expand_alpha_2x128 (src_lo, src_hi,
672                                     &alpha_lo, &alpha_hi);
673                 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
674                             &dst_lo, &dst_hi);
675
676                 save_128_aligned (
677                     (__m128i *)pd,
678                     pack_2x128_128 (dst_lo, dst_hi));
679             }
680         }
681
682         ps += 4;
683         pd += 4;
684         w -= 4;
685     }
686     while (w)
687     {
688         d = *pd;
689         s = *ps;
690
691         if (s)
692             *pd = core_combine_over_u_pixel_sse2 (s, d);
693         pd++;
694         ps++;
695
696         w--;
697     }
698 }
699
700 static force_inline void
701 sse2_combine_over_u (pixman_implementation_t *imp,
702                      pixman_op_t              op,
703                      uint32_t *               pd,
704                      const uint32_t *         ps,
705                      const uint32_t *         pm,
706                      int                      w)
707 {
708     if (pm)
709         core_combine_over_u_sse2_mask (pd, ps, pm, w);
710     else
711         core_combine_over_u_sse2_no_mask (pd, ps, w);
712 }
713
714 static void
715 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
716                              pixman_op_t              op,
717                              uint32_t *               pd,
718                              const uint32_t *         ps,
719                              const uint32_t *         pm,
720                              int                      w)
721 {
722     uint32_t s, d;
723
724     __m128i xmm_dst_lo, xmm_dst_hi;
725     __m128i xmm_src_lo, xmm_src_hi;
726     __m128i xmm_alpha_lo, xmm_alpha_hi;
727
728     /* Align dst on a 16-byte boundary */
729     while (w &&
730            ((unsigned long)pd & 15))
731     {
732         d = *pd;
733         s = combine1 (ps, pm);
734
735         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
736         w--;
737         ps++;
738         if (pm)
739             pm++;
740     }
741
742     while (w >= 4)
743     {
744         /* I'm loading unaligned because I'm not sure
745          * about the address alignment.
746          */
747         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
748         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
749
750         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
751         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
752
753         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
754                             &xmm_alpha_lo, &xmm_alpha_hi);
755
756         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
757                     &xmm_alpha_lo, &xmm_alpha_hi,
758                     &xmm_src_lo, &xmm_src_hi);
759
760         /* rebuid the 4 pixel data and save*/
761         save_128_aligned ((__m128i*)pd,
762                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
763
764         w -= 4;
765         ps += 4;
766         pd += 4;
767
768         if (pm)
769             pm += 4;
770     }
771
772     while (w)
773     {
774         d = *pd;
775         s = combine1 (ps, pm);
776
777         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
778         ps++;
779         w--;
780         if (pm)
781             pm++;
782     }
783 }
784
785 static force_inline uint32_t
786 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
787 {
788     uint32_t maska = src >> 24;
789
790     if (maska == 0)
791     {
792         return 0;
793     }
794     else if (maska != 0xff)
795     {
796         return pack_1x128_32 (
797             pix_multiply_1x128 (unpack_32_1x128 (dst),
798                                 expand_alpha_1x128 (unpack_32_1x128 (src))));
799     }
800
801     return dst;
802 }
803
804 static void
805 sse2_combine_in_u (pixman_implementation_t *imp,
806                    pixman_op_t              op,
807                    uint32_t *               pd,
808                    const uint32_t *         ps,
809                    const uint32_t *         pm,
810                    int                      w)
811 {
812     uint32_t s, d;
813
814     __m128i xmm_src_lo, xmm_src_hi;
815     __m128i xmm_dst_lo, xmm_dst_hi;
816
817     while (w && ((unsigned long) pd & 15))
818     {
819         s = combine1 (ps, pm);
820         d = *pd;
821
822         *pd++ = core_combine_in_u_pixel_sse2 (d, s);
823         w--;
824         ps++;
825         if (pm)
826             pm++;
827     }
828
829     while (w >= 4)
830     {
831         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
832         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
833
834         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
835         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
836
837         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
838         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
839                             &xmm_dst_lo, &xmm_dst_hi,
840                             &xmm_dst_lo, &xmm_dst_hi);
841
842         save_128_aligned ((__m128i*)pd,
843                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
844
845         ps += 4;
846         pd += 4;
847         w -= 4;
848         if (pm)
849             pm += 4;
850     }
851
852     while (w)
853     {
854         s = combine1 (ps, pm);
855         d = *pd;
856
857         *pd++ = core_combine_in_u_pixel_sse2 (d, s);
858         w--;
859         ps++;
860         if (pm)
861             pm++;
862     }
863 }
864
865 static void
866 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
867                            pixman_op_t              op,
868                            uint32_t *               pd,
869                            const uint32_t *         ps,
870                            const uint32_t *         pm,
871                            int                      w)
872 {
873     uint32_t s, d;
874
875     __m128i xmm_src_lo, xmm_src_hi;
876     __m128i xmm_dst_lo, xmm_dst_hi;
877
878     while (w && ((unsigned long) pd & 15))
879     {
880         s = combine1 (ps, pm);
881         d = *pd;
882
883         *pd++ = core_combine_in_u_pixel_sse2 (s, d);
884         ps++;
885         w--;
886         if (pm)
887             pm++;
888     }
889
890     while (w >= 4)
891     {
892         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
893         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
894
895         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
896         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
897
898         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
899         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
900                             &xmm_src_lo, &xmm_src_hi,
901                             &xmm_dst_lo, &xmm_dst_hi);
902
903         save_128_aligned (
904             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
905
906         ps += 4;
907         pd += 4;
908         w -= 4;
909         if (pm)
910             pm += 4;
911     }
912
913     while (w)
914     {
915         s = combine1 (ps, pm);
916         d = *pd;
917
918         *pd++ = core_combine_in_u_pixel_sse2 (s, d);
919         w--;
920         ps++;
921         if (pm)
922             pm++;
923     }
924 }
925
926 static void
927 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
928                             pixman_op_t              op,
929                             uint32_t *               pd,
930                             const uint32_t *         ps,
931                             const uint32_t *         pm,
932                             int                      w)
933 {
934     while (w && ((unsigned long) pd & 15))
935     {
936         uint32_t s = combine1 (ps, pm);
937         uint32_t d = *pd;
938
939         *pd++ = pack_1x128_32 (
940             pix_multiply_1x128 (
941                 unpack_32_1x128 (d), negate_1x128 (
942                     expand_alpha_1x128 (unpack_32_1x128 (s)))));
943
944         if (pm)
945             pm++;
946         ps++;
947         w--;
948     }
949
950     while (w >= 4)
951     {
952         __m128i xmm_src_lo, xmm_src_hi;
953         __m128i xmm_dst_lo, xmm_dst_hi;
954
955         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
956         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
957
958         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
959         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
960
961         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
962         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
963
964         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
965                             &xmm_src_lo, &xmm_src_hi,
966                             &xmm_dst_lo, &xmm_dst_hi);
967
968         save_128_aligned (
969             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
970
971         ps += 4;
972         pd += 4;
973         if (pm)
974             pm += 4;
975
976         w -= 4;
977     }
978
979     while (w)
980     {
981         uint32_t s = combine1 (ps, pm);
982         uint32_t d = *pd;
983
984         *pd++ = pack_1x128_32 (
985             pix_multiply_1x128 (
986                 unpack_32_1x128 (d), negate_1x128 (
987                     expand_alpha_1x128 (unpack_32_1x128 (s)))));
988         ps++;
989         if (pm)
990             pm++;
991         w--;
992     }
993 }
994
995 static void
996 sse2_combine_out_u (pixman_implementation_t *imp,
997                     pixman_op_t              op,
998                     uint32_t *               pd,
999                     const uint32_t *         ps,
1000                     const uint32_t *         pm,
1001                     int                      w)
1002 {
1003     while (w && ((unsigned long) pd & 15))
1004     {
1005         uint32_t s = combine1 (ps, pm);
1006         uint32_t d = *pd;
1007
1008         *pd++ = pack_1x128_32 (
1009             pix_multiply_1x128 (
1010                 unpack_32_1x128 (s), negate_1x128 (
1011                     expand_alpha_1x128 (unpack_32_1x128 (d)))));
1012         w--;
1013         ps++;
1014         if (pm)
1015             pm++;
1016     }
1017
1018     while (w >= 4)
1019     {
1020         __m128i xmm_src_lo, xmm_src_hi;
1021         __m128i xmm_dst_lo, xmm_dst_hi;
1022
1023         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1024         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1025
1026         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1027         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1028
1029         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1030         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1031
1032         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1033                             &xmm_dst_lo, &xmm_dst_hi,
1034                             &xmm_dst_lo, &xmm_dst_hi);
1035
1036         save_128_aligned (
1037             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1038
1039         ps += 4;
1040         pd += 4;
1041         w -= 4;
1042         if (pm)
1043             pm += 4;
1044     }
1045
1046     while (w)
1047     {
1048         uint32_t s = combine1 (ps, pm);
1049         uint32_t d = *pd;
1050
1051         *pd++ = pack_1x128_32 (
1052             pix_multiply_1x128 (
1053                 unpack_32_1x128 (s), negate_1x128 (
1054                     expand_alpha_1x128 (unpack_32_1x128 (d)))));
1055         w--;
1056         ps++;
1057         if (pm)
1058             pm++;
1059     }
1060 }
1061
1062 static force_inline uint32_t
1063 core_combine_atop_u_pixel_sse2 (uint32_t src,
1064                                 uint32_t dst)
1065 {
1066     __m128i s = unpack_32_1x128 (src);
1067     __m128i d = unpack_32_1x128 (dst);
1068
1069     __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1070     __m128i da = expand_alpha_1x128 (d);
1071
1072     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1073 }
1074
1075 static void
1076 sse2_combine_atop_u (pixman_implementation_t *imp,
1077                      pixman_op_t              op,
1078                      uint32_t *               pd,
1079                      const uint32_t *         ps,
1080                      const uint32_t *         pm,
1081                      int                      w)
1082 {
1083     uint32_t s, d;
1084
1085     __m128i xmm_src_lo, xmm_src_hi;
1086     __m128i xmm_dst_lo, xmm_dst_hi;
1087     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1088     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1089
1090     while (w && ((unsigned long) pd & 15))
1091     {
1092         s = combine1 (ps, pm);
1093         d = *pd;
1094
1095         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1096         w--;
1097         ps++;
1098         if (pm)
1099             pm++;
1100     }
1101
1102     while (w >= 4)
1103     {
1104         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1105         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1106
1107         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1108         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1109
1110         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1111                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1112         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1113                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1114
1115         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1116                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1117
1118         pix_add_multiply_2x128 (
1119             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1120             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1121             &xmm_dst_lo, &xmm_dst_hi);
1122
1123         save_128_aligned (
1124             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1125
1126         ps += 4;
1127         pd += 4;
1128         w -= 4;
1129         if (pm)
1130             pm += 4;
1131     }
1132
1133     while (w)
1134     {
1135         s = combine1 (ps, pm);
1136         d = *pd;
1137
1138         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1139         w--;
1140         ps++;
1141         if (pm)
1142             pm++;
1143     }
1144 }
1145
1146 static force_inline uint32_t
1147 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1148                                         uint32_t dst)
1149 {
1150     __m128i s = unpack_32_1x128 (src);
1151     __m128i d = unpack_32_1x128 (dst);
1152
1153     __m128i sa = expand_alpha_1x128 (s);
1154     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1155
1156     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1157 }
1158
1159 static void
1160 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1161                              pixman_op_t              op,
1162                              uint32_t *               pd,
1163                              const uint32_t *         ps,
1164                              const uint32_t *         pm,
1165                              int                      w)
1166 {
1167     uint32_t s, d;
1168
1169     __m128i xmm_src_lo, xmm_src_hi;
1170     __m128i xmm_dst_lo, xmm_dst_hi;
1171     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1172     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1173
1174     while (w && ((unsigned long) pd & 15))
1175     {
1176         s = combine1 (ps, pm);
1177         d = *pd;
1178
1179         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1180         ps++;
1181         w--;
1182         if (pm)
1183             pm++;
1184     }
1185
1186     while (w >= 4)
1187     {
1188         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1189         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1190
1191         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1192         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1193
1194         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1195                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1196         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1197                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1198
1199         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1200                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1201
1202         pix_add_multiply_2x128 (
1203             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1204             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1205             &xmm_dst_lo, &xmm_dst_hi);
1206
1207         save_128_aligned (
1208             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1209
1210         ps += 4;
1211         pd += 4;
1212         w -= 4;
1213         if (pm)
1214             pm += 4;
1215     }
1216
1217     while (w)
1218     {
1219         s = combine1 (ps, pm);
1220         d = *pd;
1221
1222         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1223         ps++;
1224         w--;
1225         if (pm)
1226             pm++;
1227     }
1228 }
1229
1230 static force_inline uint32_t
1231 core_combine_xor_u_pixel_sse2 (uint32_t src,
1232                                uint32_t dst)
1233 {
1234     __m128i s = unpack_32_1x128 (src);
1235     __m128i d = unpack_32_1x128 (dst);
1236
1237     __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1238     __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1239
1240     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1241 }
1242
1243 static void
1244 sse2_combine_xor_u (pixman_implementation_t *imp,
1245                     pixman_op_t              op,
1246                     uint32_t *               dst,
1247                     const uint32_t *         src,
1248                     const uint32_t *         mask,
1249                     int                      width)
1250 {
1251     int w = width;
1252     uint32_t s, d;
1253     uint32_t* pd = dst;
1254     const uint32_t* ps = src;
1255     const uint32_t* pm = mask;
1256
1257     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1258     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1259     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1260     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1261
1262     while (w && ((unsigned long) pd & 15))
1263     {
1264         s = combine1 (ps, pm);
1265         d = *pd;
1266
1267         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1268         w--;
1269         ps++;
1270         if (pm)
1271             pm++;
1272     }
1273
1274     while (w >= 4)
1275     {
1276         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1277         xmm_dst = load_128_aligned ((__m128i*) pd);
1278
1279         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1280         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1281
1282         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1283                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1284         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1285                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1286
1287         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1288                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1289         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1290                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1291
1292         pix_add_multiply_2x128 (
1293             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1294             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1295             &xmm_dst_lo, &xmm_dst_hi);
1296
1297         save_128_aligned (
1298             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1299
1300         ps += 4;
1301         pd += 4;
1302         w -= 4;
1303         if (pm)
1304             pm += 4;
1305     }
1306
1307     while (w)
1308     {
1309         s = combine1 (ps, pm);
1310         d = *pd;
1311
1312         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1313         w--;
1314         ps++;
1315         if (pm)
1316             pm++;
1317     }
1318 }
1319
1320 static force_inline void
1321 sse2_combine_add_u (pixman_implementation_t *imp,
1322                     pixman_op_t              op,
1323                     uint32_t *               dst,
1324                     const uint32_t *         src,
1325                     const uint32_t *         mask,
1326                     int                      width)
1327 {
1328     int w = width;
1329     uint32_t s, d;
1330     uint32_t* pd = dst;
1331     const uint32_t* ps = src;
1332     const uint32_t* pm = mask;
1333
1334     while (w && (unsigned long)pd & 15)
1335     {
1336         s = combine1 (ps, pm);
1337         d = *pd;
1338
1339         ps++;
1340         if (pm)
1341             pm++;
1342         *pd++ = _mm_cvtsi128_si32 (
1343             _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1344         w--;
1345     }
1346
1347     while (w >= 4)
1348     {
1349         __m128i s;
1350
1351         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1352
1353         save_128_aligned (
1354             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1355
1356         pd += 4;
1357         ps += 4;
1358         if (pm)
1359             pm += 4;
1360         w -= 4;
1361     }
1362
1363     while (w--)
1364     {
1365         s = combine1 (ps, pm);
1366         d = *pd;
1367
1368         ps++;
1369         *pd++ = _mm_cvtsi128_si32 (
1370             _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1371         if (pm)
1372             pm++;
1373     }
1374 }
1375
1376 static force_inline uint32_t
1377 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1378                                     uint32_t dst)
1379 {
1380     __m128i ms = unpack_32_1x128 (src);
1381     __m128i md = unpack_32_1x128 (dst);
1382     uint32_t sa = src >> 24;
1383     uint32_t da = ~dst >> 24;
1384
1385     if (sa > da)
1386     {
1387         ms = pix_multiply_1x128 (
1388             ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1389     }
1390
1391     return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1392 }
1393
1394 static void
1395 sse2_combine_saturate_u (pixman_implementation_t *imp,
1396                          pixman_op_t              op,
1397                          uint32_t *               pd,
1398                          const uint32_t *         ps,
1399                          const uint32_t *         pm,
1400                          int                      w)
1401 {
1402     uint32_t s, d;
1403
1404     uint32_t pack_cmp;
1405     __m128i xmm_src, xmm_dst;
1406
1407     while (w && (unsigned long)pd & 15)
1408     {
1409         s = combine1 (ps, pm);
1410         d = *pd;
1411
1412         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1413         w--;
1414         ps++;
1415         if (pm)
1416             pm++;
1417     }
1418
1419     while (w >= 4)
1420     {
1421         xmm_dst = load_128_aligned  ((__m128i*)pd);
1422         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1423
1424         pack_cmp = _mm_movemask_epi8 (
1425             _mm_cmpgt_epi32 (
1426                 _mm_srli_epi32 (xmm_src, 24),
1427                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1428
1429         /* if some alpha src is grater than respective ~alpha dst */
1430         if (pack_cmp)
1431         {
1432             s = combine1 (ps++, pm);
1433             d = *pd;
1434             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1435             if (pm)
1436                 pm++;
1437
1438             s = combine1 (ps++, pm);
1439             d = *pd;
1440             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1441             if (pm)
1442                 pm++;
1443
1444             s = combine1 (ps++, pm);
1445             d = *pd;
1446             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1447             if (pm)
1448                 pm++;
1449
1450             s = combine1 (ps++, pm);
1451             d = *pd;
1452             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1453             if (pm)
1454                 pm++;
1455         }
1456         else
1457         {
1458             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1459
1460             pd += 4;
1461             ps += 4;
1462             if (pm)
1463                 pm += 4;
1464         }
1465
1466         w -= 4;
1467     }
1468
1469     while (w--)
1470     {
1471         s = combine1 (ps, pm);
1472         d = *pd;
1473
1474         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1475         ps++;
1476         if (pm)
1477             pm++;
1478     }
1479 }
1480
1481 static void
1482 sse2_combine_src_ca (pixman_implementation_t *imp,
1483                      pixman_op_t              op,
1484                      uint32_t *               pd,
1485                      const uint32_t *         ps,
1486                      const uint32_t *         pm,
1487                      int                      w)
1488 {
1489     uint32_t s, m;
1490
1491     __m128i xmm_src_lo, xmm_src_hi;
1492     __m128i xmm_mask_lo, xmm_mask_hi;
1493     __m128i xmm_dst_lo, xmm_dst_hi;
1494
1495     while (w && (unsigned long)pd & 15)
1496     {
1497         s = *ps++;
1498         m = *pm++;
1499         *pd++ = pack_1x128_32 (
1500             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1501         w--;
1502     }
1503
1504     while (w >= 4)
1505     {
1506         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1507         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1508
1509         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1510         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1511
1512         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1513                             &xmm_mask_lo, &xmm_mask_hi,
1514                             &xmm_dst_lo, &xmm_dst_hi);
1515
1516         save_128_aligned (
1517             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1518
1519         ps += 4;
1520         pd += 4;
1521         pm += 4;
1522         w -= 4;
1523     }
1524
1525     while (w)
1526     {
1527         s = *ps++;
1528         m = *pm++;
1529         *pd++ = pack_1x128_32 (
1530             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1531         w--;
1532     }
1533 }
1534
1535 static force_inline uint32_t
1536 core_combine_over_ca_pixel_sse2 (uint32_t src,
1537                                  uint32_t mask,
1538                                  uint32_t dst)
1539 {
1540     __m128i s = unpack_32_1x128 (src);
1541     __m128i expAlpha = expand_alpha_1x128 (s);
1542     __m128i unpk_mask = unpack_32_1x128 (mask);
1543     __m128i unpk_dst  = unpack_32_1x128 (dst);
1544
1545     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1546 }
1547
1548 static void
1549 sse2_combine_over_ca (pixman_implementation_t *imp,
1550                       pixman_op_t              op,
1551                       uint32_t *               pd,
1552                       const uint32_t *         ps,
1553                       const uint32_t *         pm,
1554                       int                      w)
1555 {
1556     uint32_t s, m, d;
1557
1558     __m128i xmm_alpha_lo, xmm_alpha_hi;
1559     __m128i xmm_src_lo, xmm_src_hi;
1560     __m128i xmm_dst_lo, xmm_dst_hi;
1561     __m128i xmm_mask_lo, xmm_mask_hi;
1562
1563     while (w && (unsigned long)pd & 15)
1564     {
1565         s = *ps++;
1566         m = *pm++;
1567         d = *pd;
1568
1569         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1570         w--;
1571     }
1572
1573     while (w >= 4)
1574     {
1575         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1576         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1577         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1578
1579         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1580         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1581         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1582
1583         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1584                             &xmm_alpha_lo, &xmm_alpha_hi);
1585
1586         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1587                        &xmm_alpha_lo, &xmm_alpha_hi,
1588                        &xmm_mask_lo, &xmm_mask_hi,
1589                        &xmm_dst_lo, &xmm_dst_hi);
1590
1591         save_128_aligned (
1592             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1593
1594         ps += 4;
1595         pd += 4;
1596         pm += 4;
1597         w -= 4;
1598     }
1599
1600     while (w)
1601     {
1602         s = *ps++;
1603         m = *pm++;
1604         d = *pd;
1605
1606         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1607         w--;
1608     }
1609 }
1610
1611 static force_inline uint32_t
1612 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1613                                          uint32_t mask,
1614                                          uint32_t dst)
1615 {
1616     __m128i d = unpack_32_1x128 (dst);
1617
1618     return pack_1x128_32 (
1619         over_1x128 (d, expand_alpha_1x128 (d),
1620                     pix_multiply_1x128 (unpack_32_1x128 (src),
1621                                         unpack_32_1x128 (mask))));
1622 }
1623
1624 static void
1625 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1626                               pixman_op_t              op,
1627                               uint32_t *               pd,
1628                               const uint32_t *         ps,
1629                               const uint32_t *         pm,
1630                               int                      w)
1631 {
1632     uint32_t s, m, d;
1633
1634     __m128i xmm_alpha_lo, xmm_alpha_hi;
1635     __m128i xmm_src_lo, xmm_src_hi;
1636     __m128i xmm_dst_lo, xmm_dst_hi;
1637     __m128i xmm_mask_lo, xmm_mask_hi;
1638
1639     while (w && (unsigned long)pd & 15)
1640     {
1641         s = *ps++;
1642         m = *pm++;
1643         d = *pd;
1644
1645         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1646         w--;
1647     }
1648
1649     while (w >= 4)
1650     {
1651         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1652         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1653         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1654
1655         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1656         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1657         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1658
1659         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1660                             &xmm_alpha_lo, &xmm_alpha_hi);
1661         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1662                             &xmm_mask_lo, &xmm_mask_hi,
1663                             &xmm_mask_lo, &xmm_mask_hi);
1664
1665         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1666                     &xmm_alpha_lo, &xmm_alpha_hi,
1667                     &xmm_mask_lo, &xmm_mask_hi);
1668
1669         save_128_aligned (
1670             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1671
1672         ps += 4;
1673         pd += 4;
1674         pm += 4;
1675         w -= 4;
1676     }
1677
1678     while (w)
1679     {
1680         s = *ps++;
1681         m = *pm++;
1682         d = *pd;
1683
1684         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1685         w--;
1686     }
1687 }
1688
1689 static void
1690 sse2_combine_in_ca (pixman_implementation_t *imp,
1691                     pixman_op_t              op,
1692                     uint32_t *               pd,
1693                     const uint32_t *         ps,
1694                     const uint32_t *         pm,
1695                     int                      w)
1696 {
1697     uint32_t s, m, d;
1698
1699     __m128i xmm_alpha_lo, xmm_alpha_hi;
1700     __m128i xmm_src_lo, xmm_src_hi;
1701     __m128i xmm_dst_lo, xmm_dst_hi;
1702     __m128i xmm_mask_lo, xmm_mask_hi;
1703
1704     while (w && (unsigned long)pd & 15)
1705     {
1706         s = *ps++;
1707         m = *pm++;
1708         d = *pd;
1709
1710         *pd++ = pack_1x128_32 (
1711             pix_multiply_1x128 (
1712                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1713                 expand_alpha_1x128 (unpack_32_1x128 (d))));
1714
1715         w--;
1716     }
1717
1718     while (w >= 4)
1719     {
1720         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1721         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1722         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1723
1724         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1725         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1726         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1727
1728         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1729                             &xmm_alpha_lo, &xmm_alpha_hi);
1730
1731         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1732                             &xmm_mask_lo, &xmm_mask_hi,
1733                             &xmm_dst_lo, &xmm_dst_hi);
1734
1735         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1736                             &xmm_alpha_lo, &xmm_alpha_hi,
1737                             &xmm_dst_lo, &xmm_dst_hi);
1738
1739         save_128_aligned (
1740             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1741
1742         ps += 4;
1743         pd += 4;
1744         pm += 4;
1745         w -= 4;
1746     }
1747
1748     while (w)
1749     {
1750         s = *ps++;
1751         m = *pm++;
1752         d = *pd;
1753
1754         *pd++ = pack_1x128_32 (
1755             pix_multiply_1x128 (
1756                 pix_multiply_1x128 (
1757                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1758                 expand_alpha_1x128 (unpack_32_1x128 (d))));
1759
1760         w--;
1761     }
1762 }
1763
1764 static void
1765 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1766                             pixman_op_t              op,
1767                             uint32_t *               pd,
1768                             const uint32_t *         ps,
1769                             const uint32_t *         pm,
1770                             int                      w)
1771 {
1772     uint32_t s, m, d;
1773
1774     __m128i xmm_alpha_lo, xmm_alpha_hi;
1775     __m128i xmm_src_lo, xmm_src_hi;
1776     __m128i xmm_dst_lo, xmm_dst_hi;
1777     __m128i xmm_mask_lo, xmm_mask_hi;
1778
1779     while (w && (unsigned long)pd & 15)
1780     {
1781         s = *ps++;
1782         m = *pm++;
1783         d = *pd;
1784
1785         *pd++ = pack_1x128_32 (
1786             pix_multiply_1x128 (
1787                 unpack_32_1x128 (d),
1788                 pix_multiply_1x128 (unpack_32_1x128 (m),
1789                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1790         w--;
1791     }
1792
1793     while (w >= 4)
1794     {
1795         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1796         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1797         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1798
1799         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1800         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1801         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1802
1803         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1804                             &xmm_alpha_lo, &xmm_alpha_hi);
1805         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1806                             &xmm_alpha_lo, &xmm_alpha_hi,
1807                             &xmm_alpha_lo, &xmm_alpha_hi);
1808
1809         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1810                             &xmm_alpha_lo, &xmm_alpha_hi,
1811                             &xmm_dst_lo, &xmm_dst_hi);
1812
1813         save_128_aligned (
1814             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1815
1816         ps += 4;
1817         pd += 4;
1818         pm += 4;
1819         w -= 4;
1820     }
1821
1822     while (w)
1823     {
1824         s = *ps++;
1825         m = *pm++;
1826         d = *pd;
1827
1828         *pd++ = pack_1x128_32 (
1829             pix_multiply_1x128 (
1830                 unpack_32_1x128 (d),
1831                 pix_multiply_1x128 (unpack_32_1x128 (m),
1832                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1833         w--;
1834     }
1835 }
1836
1837 static void
1838 sse2_combine_out_ca (pixman_implementation_t *imp,
1839                      pixman_op_t              op,
1840                      uint32_t *               pd,
1841                      const uint32_t *         ps,
1842                      const uint32_t *         pm,
1843                      int                      w)
1844 {
1845     uint32_t s, m, d;
1846
1847     __m128i xmm_alpha_lo, xmm_alpha_hi;
1848     __m128i xmm_src_lo, xmm_src_hi;
1849     __m128i xmm_dst_lo, xmm_dst_hi;
1850     __m128i xmm_mask_lo, xmm_mask_hi;
1851
1852     while (w && (unsigned long)pd & 15)
1853     {
1854         s = *ps++;
1855         m = *pm++;
1856         d = *pd;
1857
1858         *pd++ = pack_1x128_32 (
1859             pix_multiply_1x128 (
1860                 pix_multiply_1x128 (
1861                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1862                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1863         w--;
1864     }
1865
1866     while (w >= 4)
1867     {
1868         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1869         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1870         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1871
1872         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1873         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1874         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1875
1876         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1877                             &xmm_alpha_lo, &xmm_alpha_hi);
1878         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1879                       &xmm_alpha_lo, &xmm_alpha_hi);
1880
1881         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1882                             &xmm_mask_lo, &xmm_mask_hi,
1883                             &xmm_dst_lo, &xmm_dst_hi);
1884         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1885                             &xmm_alpha_lo, &xmm_alpha_hi,
1886                             &xmm_dst_lo, &xmm_dst_hi);
1887
1888         save_128_aligned (
1889             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1890
1891         ps += 4;
1892         pd += 4;
1893         pm += 4;
1894         w -= 4;
1895     }
1896
1897     while (w)
1898     {
1899         s = *ps++;
1900         m = *pm++;
1901         d = *pd;
1902
1903         *pd++ = pack_1x128_32 (
1904             pix_multiply_1x128 (
1905                 pix_multiply_1x128 (
1906                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1907                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1908
1909         w--;
1910     }
1911 }
1912
1913 static void
1914 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1915                              pixman_op_t              op,
1916                              uint32_t *               pd,
1917                              const uint32_t *         ps,
1918                              const uint32_t *         pm,
1919                              int                      w)
1920 {
1921     uint32_t s, m, d;
1922
1923     __m128i xmm_alpha_lo, xmm_alpha_hi;
1924     __m128i xmm_src_lo, xmm_src_hi;
1925     __m128i xmm_dst_lo, xmm_dst_hi;
1926     __m128i xmm_mask_lo, xmm_mask_hi;
1927
1928     while (w && (unsigned long)pd & 15)
1929     {
1930         s = *ps++;
1931         m = *pm++;
1932         d = *pd;
1933
1934         *pd++ = pack_1x128_32 (
1935             pix_multiply_1x128 (
1936                 unpack_32_1x128 (d),
1937                 negate_1x128 (pix_multiply_1x128 (
1938                                  unpack_32_1x128 (m),
1939                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1940         w--;
1941     }
1942
1943     while (w >= 4)
1944     {
1945         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1946         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1947         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1948
1949         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1950         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1951         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1952
1953         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1954                             &xmm_alpha_lo, &xmm_alpha_hi);
1955
1956         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1957                             &xmm_alpha_lo, &xmm_alpha_hi,
1958                             &xmm_mask_lo, &xmm_mask_hi);
1959
1960         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1961                       &xmm_mask_lo, &xmm_mask_hi);
1962
1963         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1964                             &xmm_mask_lo, &xmm_mask_hi,
1965                             &xmm_dst_lo, &xmm_dst_hi);
1966
1967         save_128_aligned (
1968             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1969
1970         ps += 4;
1971         pd += 4;
1972         pm += 4;
1973         w -= 4;
1974     }
1975
1976     while (w)
1977     {
1978         s = *ps++;
1979         m = *pm++;
1980         d = *pd;
1981
1982         *pd++ = pack_1x128_32 (
1983             pix_multiply_1x128 (
1984                 unpack_32_1x128 (d),
1985                 negate_1x128 (pix_multiply_1x128 (
1986                                  unpack_32_1x128 (m),
1987                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1988         w--;
1989     }
1990 }
1991
1992 static force_inline uint32_t
1993 core_combine_atop_ca_pixel_sse2 (uint32_t src,
1994                                  uint32_t mask,
1995                                  uint32_t dst)
1996 {
1997     __m128i m = unpack_32_1x128 (mask);
1998     __m128i s = unpack_32_1x128 (src);
1999     __m128i d = unpack_32_1x128 (dst);
2000     __m128i sa = expand_alpha_1x128 (s);
2001     __m128i da = expand_alpha_1x128 (d);
2002
2003     s = pix_multiply_1x128 (s, m);
2004     m = negate_1x128 (pix_multiply_1x128 (m, sa));
2005
2006     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2007 }
2008
2009 static void
2010 sse2_combine_atop_ca (pixman_implementation_t *imp,
2011                       pixman_op_t              op,
2012                       uint32_t *               pd,
2013                       const uint32_t *         ps,
2014                       const uint32_t *         pm,
2015                       int                      w)
2016 {
2017     uint32_t s, m, d;
2018
2019     __m128i xmm_src_lo, xmm_src_hi;
2020     __m128i xmm_dst_lo, xmm_dst_hi;
2021     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2022     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2023     __m128i xmm_mask_lo, xmm_mask_hi;
2024
2025     while (w && (unsigned long)pd & 15)
2026     {
2027         s = *ps++;
2028         m = *pm++;
2029         d = *pd;
2030
2031         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2032         w--;
2033     }
2034
2035     while (w >= 4)
2036     {
2037         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2038         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2039         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2040
2041         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2042         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2043         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2044
2045         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2046                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2047         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2048                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2049
2050         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2051                             &xmm_mask_lo, &xmm_mask_hi,
2052                             &xmm_src_lo, &xmm_src_hi);
2053         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2054                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2055                             &xmm_mask_lo, &xmm_mask_hi);
2056
2057         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2058
2059         pix_add_multiply_2x128 (
2060             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2061             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2062             &xmm_dst_lo, &xmm_dst_hi);
2063
2064         save_128_aligned (
2065             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2066
2067         ps += 4;
2068         pd += 4;
2069         pm += 4;
2070         w -= 4;
2071     }
2072
2073     while (w)
2074     {
2075         s = *ps++;
2076         m = *pm++;
2077         d = *pd;
2078
2079         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2080         w--;
2081     }
2082 }
2083
2084 static force_inline uint32_t
2085 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2086                                          uint32_t mask,
2087                                          uint32_t dst)
2088 {
2089     __m128i m = unpack_32_1x128 (mask);
2090     __m128i s = unpack_32_1x128 (src);
2091     __m128i d = unpack_32_1x128 (dst);
2092
2093     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2094     __m128i sa = expand_alpha_1x128 (s);
2095
2096     s = pix_multiply_1x128 (s, m);
2097     m = pix_multiply_1x128 (m, sa);
2098
2099     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2100 }
2101
2102 static void
2103 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2104                               pixman_op_t              op,
2105                               uint32_t *               pd,
2106                               const uint32_t *         ps,
2107                               const uint32_t *         pm,
2108                               int                      w)
2109 {
2110     uint32_t s, m, d;
2111
2112     __m128i xmm_src_lo, xmm_src_hi;
2113     __m128i xmm_dst_lo, xmm_dst_hi;
2114     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2115     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2116     __m128i xmm_mask_lo, xmm_mask_hi;
2117
2118     while (w && (unsigned long)pd & 15)
2119     {
2120         s = *ps++;
2121         m = *pm++;
2122         d = *pd;
2123
2124         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2125         w--;
2126     }
2127
2128     while (w >= 4)
2129     {
2130         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2131         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2132         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2133
2134         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2135         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2136         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2137
2138         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2139                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2140         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2141                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2142
2143         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2144                             &xmm_mask_lo, &xmm_mask_hi,
2145                             &xmm_src_lo, &xmm_src_hi);
2146         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2147                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2148                             &xmm_mask_lo, &xmm_mask_hi);
2149
2150         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2151                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2152
2153         pix_add_multiply_2x128 (
2154             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2155             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2156             &xmm_dst_lo, &xmm_dst_hi);
2157
2158         save_128_aligned (
2159             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2160
2161         ps += 4;
2162         pd += 4;
2163         pm += 4;
2164         w -= 4;
2165     }
2166
2167     while (w)
2168     {
2169         s = *ps++;
2170         m = *pm++;
2171         d = *pd;
2172
2173         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2174         w--;
2175     }
2176 }
2177
2178 static force_inline uint32_t
2179 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2180                                 uint32_t mask,
2181                                 uint32_t dst)
2182 {
2183     __m128i a = unpack_32_1x128 (mask);
2184     __m128i s = unpack_32_1x128 (src);
2185     __m128i d = unpack_32_1x128 (dst);
2186
2187     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2188                                        a, expand_alpha_1x128 (s)));
2189     __m128i dest      = pix_multiply_1x128 (s, a);
2190     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2191
2192     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2193                                                 &alpha_dst,
2194                                                 &dest,
2195                                                 &alpha_src));
2196 }
2197
2198 static void
2199 sse2_combine_xor_ca (pixman_implementation_t *imp,
2200                      pixman_op_t              op,
2201                      uint32_t *               pd,
2202                      const uint32_t *         ps,
2203                      const uint32_t *         pm,
2204                      int                      w)
2205 {
2206     uint32_t s, m, d;
2207
2208     __m128i xmm_src_lo, xmm_src_hi;
2209     __m128i xmm_dst_lo, xmm_dst_hi;
2210     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2211     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2212     __m128i xmm_mask_lo, xmm_mask_hi;
2213
2214     while (w && (unsigned long)pd & 15)
2215     {
2216         s = *ps++;
2217         m = *pm++;
2218         d = *pd;
2219
2220         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2221         w--;
2222     }
2223
2224     while (w >= 4)
2225     {
2226         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2227         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2228         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2229
2230         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2231         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2232         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2233
2234         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2235                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2236         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2237                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2238
2239         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2240                             &xmm_mask_lo, &xmm_mask_hi,
2241                             &xmm_src_lo, &xmm_src_hi);
2242         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2243                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2244                             &xmm_mask_lo, &xmm_mask_hi);
2245
2246         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2247                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2248         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2249                       &xmm_mask_lo, &xmm_mask_hi);
2250
2251         pix_add_multiply_2x128 (
2252             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2253             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2254             &xmm_dst_lo, &xmm_dst_hi);
2255
2256         save_128_aligned (
2257             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2258
2259         ps += 4;
2260         pd += 4;
2261         pm += 4;
2262         w -= 4;
2263     }
2264
2265     while (w)
2266     {
2267         s = *ps++;
2268         m = *pm++;
2269         d = *pd;
2270
2271         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2272         w--;
2273     }
2274 }
2275
2276 static void
2277 sse2_combine_add_ca (pixman_implementation_t *imp,
2278                      pixman_op_t              op,
2279                      uint32_t *               pd,
2280                      const uint32_t *         ps,
2281                      const uint32_t *         pm,
2282                      int                      w)
2283 {
2284     uint32_t s, m, d;
2285
2286     __m128i xmm_src_lo, xmm_src_hi;
2287     __m128i xmm_dst_lo, xmm_dst_hi;
2288     __m128i xmm_mask_lo, xmm_mask_hi;
2289
2290     while (w && (unsigned long)pd & 15)
2291     {
2292         s = *ps++;
2293         m = *pm++;
2294         d = *pd;
2295
2296         *pd++ = pack_1x128_32 (
2297             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2298                                                unpack_32_1x128 (m)),
2299                            unpack_32_1x128 (d)));
2300         w--;
2301     }
2302
2303     while (w >= 4)
2304     {
2305         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2306         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2307         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2308
2309         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2310         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2311         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2312
2313         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2314                             &xmm_mask_lo, &xmm_mask_hi,
2315                             &xmm_src_lo, &xmm_src_hi);
2316
2317         save_128_aligned (
2318             (__m128i*)pd, pack_2x128_128 (
2319                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2320                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2321
2322         ps += 4;
2323         pd += 4;
2324         pm += 4;
2325         w -= 4;
2326     }
2327
2328     while (w)
2329     {
2330         s = *ps++;
2331         m = *pm++;
2332         d = *pd;
2333
2334         *pd++ = pack_1x128_32 (
2335             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2336                                                unpack_32_1x128 (m)),
2337                            unpack_32_1x128 (d)));
2338         w--;
2339     }
2340 }
2341
2342 static force_inline __m128i
2343 create_mask_16_128 (uint16_t mask)
2344 {
2345     return _mm_set1_epi16 (mask);
2346 }
2347
2348 /* Work around a code generation bug in Sun Studio 12. */
2349 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2350 # define create_mask_2x32_128(mask0, mask1)                             \
2351     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2352 #else
2353 static force_inline __m128i
2354 create_mask_2x32_128 (uint32_t mask0,
2355                       uint32_t mask1)
2356 {
2357     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2358 }
2359 #endif
2360
2361 static void
2362 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2363                             pixman_op_t              op,
2364                             pixman_image_t *         src_image,
2365                             pixman_image_t *         mask_image,
2366                             pixman_image_t *         dst_image,
2367                             int32_t                  src_x,
2368                             int32_t                  src_y,
2369                             int32_t                  mask_x,
2370                             int32_t                  mask_y,
2371                             int32_t                  dest_x,
2372                             int32_t                  dest_y,
2373                             int32_t                  width,
2374                             int32_t                  height)
2375 {
2376     uint32_t src;
2377     uint32_t    *dst_line, *dst, d;
2378     int32_t w;
2379     int dst_stride;
2380     __m128i xmm_src, xmm_alpha;
2381     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2382
2383     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2384
2385     if (src == 0)
2386         return;
2387
2388     PIXMAN_IMAGE_GET_LINE (
2389         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2390
2391     xmm_src = expand_pixel_32_1x128 (src);
2392     xmm_alpha = expand_alpha_1x128 (xmm_src);
2393
2394     while (height--)
2395     {
2396         dst = dst_line;
2397
2398         dst_line += dst_stride;
2399         w = width;
2400
2401         while (w && (unsigned long)dst & 15)
2402         {
2403             d = *dst;
2404             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2405                                                 xmm_alpha,
2406                                                 unpack_32_1x128 (d)));
2407             w--;
2408         }
2409
2410         while (w >= 4)
2411         {
2412             xmm_dst = load_128_aligned ((__m128i*)dst);
2413
2414             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2415
2416             over_2x128 (&xmm_src, &xmm_src,
2417                         &xmm_alpha, &xmm_alpha,
2418                         &xmm_dst_lo, &xmm_dst_hi);
2419
2420             /* rebuid the 4 pixel data and save*/
2421             save_128_aligned (
2422                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2423
2424             w -= 4;
2425             dst += 4;
2426         }
2427
2428         while (w)
2429         {
2430             d = *dst;
2431             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2432                                                 xmm_alpha,
2433                                                 unpack_32_1x128 (d)));
2434             w--;
2435         }
2436
2437     }
2438 }
2439
2440 static void
2441 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2442                             pixman_op_t              op,
2443                             pixman_image_t *         src_image,
2444                             pixman_image_t *         mask_image,
2445                             pixman_image_t *         dst_image,
2446                             int32_t                  src_x,
2447                             int32_t                  src_y,
2448                             int32_t                  mask_x,
2449                             int32_t                  mask_y,
2450                             int32_t                  dest_x,
2451                             int32_t                  dest_y,
2452                             int32_t                  width,
2453                             int32_t                  height)
2454 {
2455     uint32_t src;
2456     uint16_t    *dst_line, *dst, d;
2457     int32_t w;
2458     int dst_stride;
2459     __m128i xmm_src, xmm_alpha;
2460     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2461
2462     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2463
2464     if (src == 0)
2465         return;
2466
2467     PIXMAN_IMAGE_GET_LINE (
2468         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2469
2470     xmm_src = expand_pixel_32_1x128 (src);
2471     xmm_alpha = expand_alpha_1x128 (xmm_src);
2472
2473     while (height--)
2474     {
2475         dst = dst_line;
2476
2477         dst_line += dst_stride;
2478         w = width;
2479
2480         while (w && (unsigned long)dst & 15)
2481         {
2482             d = *dst;
2483
2484             *dst++ = pack_565_32_16 (
2485                 pack_1x128_32 (over_1x128 (xmm_src,
2486                                            xmm_alpha,
2487                                            expand565_16_1x128 (d))));
2488             w--;
2489         }
2490
2491         while (w >= 8)
2492         {
2493             xmm_dst = load_128_aligned ((__m128i*)dst);
2494
2495             unpack_565_128_4x128 (xmm_dst,
2496                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2497
2498             over_2x128 (&xmm_src, &xmm_src,
2499                         &xmm_alpha, &xmm_alpha,
2500                         &xmm_dst0, &xmm_dst1);
2501             over_2x128 (&xmm_src, &xmm_src,
2502                         &xmm_alpha, &xmm_alpha,
2503                         &xmm_dst2, &xmm_dst3);
2504
2505             xmm_dst = pack_565_4x128_128 (
2506                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507
2508             save_128_aligned ((__m128i*)dst, xmm_dst);
2509
2510             dst += 8;
2511             w -= 8;
2512         }
2513
2514         while (w--)
2515         {
2516             d = *dst;
2517             *dst++ = pack_565_32_16 (
2518                 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2519                                            expand565_16_1x128 (d))));
2520         }
2521     }
2522
2523 }
2524
2525 static void
2526 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2527                                    pixman_op_t              op,
2528                                    pixman_image_t *         src_image,
2529                                    pixman_image_t *         mask_image,
2530                                    pixman_image_t *         dst_image,
2531                                    int32_t                  src_x,
2532                                    int32_t                  src_y,
2533                                    int32_t                  mask_x,
2534                                    int32_t                  mask_y,
2535                                    int32_t                  dest_x,
2536                                    int32_t                  dest_y,
2537                                    int32_t                  width,
2538                                    int32_t                  height)
2539 {
2540     uint32_t src, srca;
2541     uint32_t    *dst_line, d;
2542     uint32_t    *mask_line, m;
2543     uint32_t pack_cmp;
2544     int dst_stride, mask_stride;
2545
2546     __m128i xmm_src, xmm_alpha;
2547     __m128i xmm_dst;
2548     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549
2550     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2551
2552     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2553     srca = src >> 24;
2554
2555     if (src == 0)
2556         return;
2557
2558     PIXMAN_IMAGE_GET_LINE (
2559         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2560     PIXMAN_IMAGE_GET_LINE (
2561         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2562
2563     xmm_src = _mm_unpacklo_epi8 (
2564         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2565     xmm_alpha = expand_alpha_1x128 (xmm_src);
2566     mmx_src   = xmm_src;
2567     mmx_alpha = xmm_alpha;
2568
2569     while (height--)
2570     {
2571         int w = width;
2572         const uint32_t *pm = (uint32_t *)mask_line;
2573         uint32_t *pd = (uint32_t *)dst_line;
2574
2575         dst_line += dst_stride;
2576         mask_line += mask_stride;
2577
2578         while (w && (unsigned long)pd & 15)
2579         {
2580             m = *pm++;
2581
2582             if (m)
2583             {
2584                 d = *pd;
2585
2586                 mmx_mask = unpack_32_1x128 (m);
2587                 mmx_dest = unpack_32_1x128 (d);
2588
2589                 *pd = pack_1x128_32 (
2590                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2591                                    mmx_dest));
2592             }
2593
2594             pd++;
2595             w--;
2596         }
2597
2598         while (w >= 4)
2599         {
2600             xmm_mask = load_128_unaligned ((__m128i*)pm);
2601
2602             pack_cmp =
2603                 _mm_movemask_epi8 (
2604                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2605
2606             /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
2607             if (pack_cmp != 0xffff)
2608             {
2609                 xmm_dst = load_128_aligned ((__m128i*)pd);
2610
2611                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2612
2613                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2614                                     &xmm_mask_lo, &xmm_mask_hi,
2615                                     &xmm_mask_lo, &xmm_mask_hi);
2616                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2617
2618                 save_128_aligned (
2619                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2620             }
2621
2622             pd += 4;
2623             pm += 4;
2624             w -= 4;
2625         }
2626
2627         while (w)
2628         {
2629             m = *pm++;
2630
2631             if (m)
2632             {
2633                 d = *pd;
2634
2635                 mmx_mask = unpack_32_1x128 (m);
2636                 mmx_dest = unpack_32_1x128 (d);
2637
2638                 *pd = pack_1x128_32 (
2639                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2640                                    mmx_dest));
2641             }
2642
2643             pd++;
2644             w--;
2645         }
2646     }
2647
2648 }
2649
2650 static void
2651 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2652                                     pixman_op_t              op,
2653                                     pixman_image_t *         src_image,
2654                                     pixman_image_t *         mask_image,
2655                                     pixman_image_t *         dst_image,
2656                                     int32_t                  src_x,
2657                                     int32_t                  src_y,
2658                                     int32_t                  mask_x,
2659                                     int32_t                  mask_y,
2660                                     int32_t                  dest_x,
2661                                     int32_t                  dest_y,
2662                                     int32_t                  width,
2663                                     int32_t                  height)
2664 {
2665     uint32_t src;
2666     uint32_t    *dst_line, d;
2667     uint32_t    *mask_line, m;
2668     uint32_t pack_cmp;
2669     int dst_stride, mask_stride;
2670
2671     __m128i xmm_src, xmm_alpha;
2672     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2673     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2674
2675     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2676
2677     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2678
2679     if (src == 0)
2680         return;
2681
2682     PIXMAN_IMAGE_GET_LINE (
2683         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2684     PIXMAN_IMAGE_GET_LINE (
2685         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2686
2687     xmm_src = _mm_unpacklo_epi8 (
2688         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2689     xmm_alpha = expand_alpha_1x128 (xmm_src);
2690     mmx_src   = xmm_src;
2691     mmx_alpha = xmm_alpha;
2692
2693     while (height--)
2694     {
2695         int w = width;
2696         const uint32_t *pm = (uint32_t *)mask_line;
2697         uint32_t *pd = (uint32_t *)dst_line;
2698
2699         dst_line += dst_stride;
2700         mask_line += mask_stride;
2701
2702         while (w && (unsigned long)pd & 15)
2703         {
2704             m = *pm++;
2705
2706             if (m)
2707             {
2708                 d = *pd;
2709                 mmx_mask = unpack_32_1x128 (m);
2710                 mmx_dest = unpack_32_1x128 (d);
2711
2712                 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2713                                                   &mmx_alpha,
2714                                                   &mmx_mask,
2715                                                   &mmx_dest));
2716             }
2717
2718             pd++;
2719             w--;
2720         }
2721
2722         while (w >= 4)
2723         {
2724             xmm_mask = load_128_unaligned ((__m128i*)pm);
2725
2726             pack_cmp =
2727                 _mm_movemask_epi8 (
2728                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2729
2730             /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
2731             if (pack_cmp != 0xffff)
2732             {
2733                 xmm_dst = load_128_aligned ((__m128i*)pd);
2734
2735                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2736                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2737
2738                 in_over_2x128 (&xmm_src, &xmm_src,
2739                                &xmm_alpha, &xmm_alpha,
2740                                &xmm_mask_lo, &xmm_mask_hi,
2741                                &xmm_dst_lo, &xmm_dst_hi);
2742
2743                 save_128_aligned (
2744                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2745             }
2746
2747             pd += 4;
2748             pm += 4;
2749             w -= 4;
2750         }
2751
2752         while (w)
2753         {
2754             m = *pm++;
2755
2756             if (m)
2757             {
2758                 d = *pd;
2759                 mmx_mask = unpack_32_1x128 (m);
2760                 mmx_dest = unpack_32_1x128 (d);
2761
2762                 *pd = pack_1x128_32 (
2763                     in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2764             }
2765
2766             pd++;
2767             w--;
2768         }
2769     }
2770
2771 }
2772
2773 static void
2774 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2775                                  pixman_op_t              op,
2776                                  pixman_image_t *         src_image,
2777                                  pixman_image_t *         mask_image,
2778                                  pixman_image_t *         dst_image,
2779                                  int32_t                  src_x,
2780                                  int32_t                  src_y,
2781                                  int32_t                  mask_x,
2782                                  int32_t                  mask_y,
2783                                  int32_t                  dest_x,
2784                                  int32_t                  dest_y,
2785                                  int32_t                  width,
2786                                  int32_t                  height)
2787 {
2788     uint32_t    *dst_line, *dst;
2789     uint32_t    *src_line, *src;
2790     uint32_t mask;
2791     int32_t w;
2792     int dst_stride, src_stride;
2793
2794     __m128i xmm_mask;
2795     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2796     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2797     __m128i xmm_alpha_lo, xmm_alpha_hi;
2798
2799     PIXMAN_IMAGE_GET_LINE (
2800         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2801     PIXMAN_IMAGE_GET_LINE (
2802         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2803
2804     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2805
2806     xmm_mask = create_mask_16_128 (mask >> 24);
2807
2808     while (height--)
2809     {
2810         dst = dst_line;
2811         dst_line += dst_stride;
2812         src = src_line;
2813         src_line += src_stride;
2814         w = width;
2815
2816         while (w && (unsigned long)dst & 15)
2817         {
2818             uint32_t s = *src++;
2819
2820             if (s)
2821             {
2822                 uint32_t d = *dst;
2823                 
2824                 __m128i ms = unpack_32_1x128 (s);
2825                 __m128i alpha    = expand_alpha_1x128 (ms);
2826                 __m128i dest     = xmm_mask;
2827                 __m128i alpha_dst = unpack_32_1x128 (d);
2828                 
2829                 *dst = pack_1x128_32 (
2830                     in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2831             }
2832             dst++;
2833             w--;
2834         }
2835
2836         while (w >= 4)
2837         {
2838             xmm_src = load_128_unaligned ((__m128i*)src);
2839
2840             if (!is_zero (xmm_src))
2841             {
2842                 xmm_dst = load_128_aligned ((__m128i*)dst);
2843                 
2844                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2845                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2846                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2847                                     &xmm_alpha_lo, &xmm_alpha_hi);
2848                 
2849                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2850                                &xmm_alpha_lo, &xmm_alpha_hi,
2851                                &xmm_mask, &xmm_mask,
2852                                &xmm_dst_lo, &xmm_dst_hi);
2853                 
2854                 save_128_aligned (
2855                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2856             }
2857                 
2858             dst += 4;
2859             src += 4;
2860             w -= 4;
2861         }
2862
2863         while (w)
2864         {
2865             uint32_t s = *src++;
2866
2867             if (s)
2868             {
2869                 uint32_t d = *dst;
2870                 
2871                 __m128i ms = unpack_32_1x128 (s);
2872                 __m128i alpha = expand_alpha_1x128 (ms);
2873                 __m128i mask  = xmm_mask;
2874                 __m128i dest  = unpack_32_1x128 (d);
2875                 
2876                 *dst = pack_1x128_32 (
2877                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2878             }
2879
2880             dst++;
2881             w--;
2882         }
2883     }
2884
2885 }
2886
2887 static void
2888 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2889                               pixman_op_t              op,
2890                               pixman_image_t *         src_image,
2891                               pixman_image_t *         mask_image,
2892                               pixman_image_t *         dst_image,
2893                               int32_t                  src_x,
2894                               int32_t                  src_y,
2895                               int32_t                  mask_x,
2896                               int32_t                  mask_y,
2897                               int32_t                  dest_x,
2898                               int32_t                  dest_y,
2899                               int32_t                  width,
2900                               int32_t                  height)
2901 {
2902     uint32_t    *dst_line, *dst;
2903     uint32_t    *src_line, *src;
2904     int32_t w;
2905     int dst_stride, src_stride;
2906
2907
2908     PIXMAN_IMAGE_GET_LINE (
2909         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2910     PIXMAN_IMAGE_GET_LINE (
2911         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2912
2913     while (height--)
2914     {
2915         dst = dst_line;
2916         dst_line += dst_stride;
2917         src = src_line;
2918         src_line += src_stride;
2919         w = width;
2920
2921         while (w && (unsigned long)dst & 15)
2922         {
2923             *dst++ = *src++ | 0xff000000;
2924             w--;
2925         }
2926
2927         while (w >= 16)
2928         {
2929             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2930             
2931             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2932             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2933             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2934             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2935             
2936             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2937             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2938             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2939             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2940             
2941             dst += 16;
2942             src += 16;
2943             w -= 16;
2944         }
2945
2946         while (w)
2947         {
2948             *dst++ = *src++ | 0xff000000;
2949             w--;
2950         }
2951     }
2952
2953 }
2954
2955 static void
2956 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2957                                  pixman_op_t              op,
2958                                  pixman_image_t *         src_image,
2959                                  pixman_image_t *         mask_image,
2960                                  pixman_image_t *         dst_image,
2961                                  int32_t                  src_x,
2962                                  int32_t                  src_y,
2963                                  int32_t                  mask_x,
2964                                  int32_t                  mask_y,
2965                                  int32_t                  dest_x,
2966                                  int32_t                  dest_y,
2967                                  int32_t                  width,
2968                                  int32_t                  height)
2969 {
2970     uint32_t    *dst_line, *dst;
2971     uint32_t    *src_line, *src;
2972     uint32_t mask;
2973     int dst_stride, src_stride;
2974     int32_t w;
2975
2976     __m128i xmm_mask, xmm_alpha;
2977     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2978     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2979
2980     PIXMAN_IMAGE_GET_LINE (
2981         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2982     PIXMAN_IMAGE_GET_LINE (
2983         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2984
2985     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2986
2987     xmm_mask = create_mask_16_128 (mask >> 24);
2988     xmm_alpha = mask_00ff;
2989
2990     while (height--)
2991     {
2992         dst = dst_line;
2993         dst_line += dst_stride;
2994         src = src_line;
2995         src_line += src_stride;
2996         w = width;
2997
2998         while (w && (unsigned long)dst & 15)
2999         {
3000             uint32_t s = (*src++) | 0xff000000;
3001             uint32_t d = *dst;
3002
3003             __m128i src   = unpack_32_1x128 (s);
3004             __m128i alpha = xmm_alpha;
3005             __m128i mask  = xmm_mask;
3006             __m128i dest  = unpack_32_1x128 (d);
3007
3008             *dst++ = pack_1x128_32 (
3009                 in_over_1x128 (&src, &alpha, &mask, &dest));
3010
3011             w--;
3012         }
3013
3014         while (w >= 4)
3015         {
3016             xmm_src = _mm_or_si128 (
3017                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3018             xmm_dst = load_128_aligned ((__m128i*)dst);
3019
3020             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3021             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3022
3023             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3024                            &xmm_alpha, &xmm_alpha,
3025                            &xmm_mask, &xmm_mask,
3026                            &xmm_dst_lo, &xmm_dst_hi);
3027
3028             save_128_aligned (
3029                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3030
3031             dst += 4;
3032             src += 4;
3033             w -= 4;
3034
3035         }
3036
3037         while (w)
3038         {
3039             uint32_t s = (*src++) | 0xff000000;
3040             uint32_t d = *dst;
3041
3042             __m128i src  = unpack_32_1x128 (s);
3043             __m128i alpha = xmm_alpha;
3044             __m128i mask  = xmm_mask;
3045             __m128i dest  = unpack_32_1x128 (d);
3046
3047             *dst++ = pack_1x128_32 (
3048                 in_over_1x128 (&src, &alpha, &mask, &dest));
3049
3050             w--;
3051         }
3052     }
3053
3054 }
3055
3056 static void
3057 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3058                                pixman_op_t              op,
3059                                pixman_image_t *         src_image,
3060                                pixman_image_t *         mask_image,
3061                                pixman_image_t *         dst_image,
3062                                int32_t                  src_x,
3063                                int32_t                  src_y,
3064                                int32_t                  mask_x,
3065                                int32_t                  mask_y,
3066                                int32_t                  dest_x,
3067                                int32_t                  dest_y,
3068                                int32_t                  width,
3069                                int32_t                  height)
3070 {
3071     int dst_stride, src_stride;
3072     uint32_t    *dst_line, *dst;
3073     uint32_t    *src_line, *src;
3074
3075     PIXMAN_IMAGE_GET_LINE (
3076         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3077     PIXMAN_IMAGE_GET_LINE (
3078         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3079
3080     dst = dst_line;
3081     src = src_line;
3082
3083     while (height--)
3084     {
3085         sse2_combine_over_u (imp, op, dst, src, NULL, width);
3086
3087         dst += dst_stride;
3088         src += src_stride;
3089     }
3090 }
3091
3092 static force_inline uint16_t
3093 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3094 {
3095     __m128i ms;
3096
3097     ms = unpack_32_1x128 (src);
3098     return pack_565_32_16 (
3099         pack_1x128_32 (
3100             over_1x128 (
3101                 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3102 }
3103
3104 static void
3105 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3106                                pixman_op_t              op,
3107                                pixman_image_t *         src_image,
3108                                pixman_image_t *         mask_image,
3109                                pixman_image_t *         dst_image,
3110                                int32_t                  src_x,
3111                                int32_t                  src_y,
3112                                int32_t                  mask_x,
3113                                int32_t                  mask_y,
3114                                int32_t                  dest_x,
3115                                int32_t                  dest_y,
3116                                int32_t                  width,
3117                                int32_t                  height)
3118 {
3119     uint16_t    *dst_line, *dst, d;
3120     uint32_t    *src_line, *src, s;
3121     int dst_stride, src_stride;
3122     int32_t w;
3123
3124     __m128i xmm_alpha_lo, xmm_alpha_hi;
3125     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3126     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3127
3128     PIXMAN_IMAGE_GET_LINE (
3129         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3130     PIXMAN_IMAGE_GET_LINE (
3131         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3132
3133     while (height--)
3134     {
3135         dst = dst_line;
3136         src = src_line;
3137
3138         dst_line += dst_stride;
3139         src_line += src_stride;
3140         w = width;
3141
3142         /* Align dst on a 16-byte boundary */
3143         while (w &&
3144                ((unsigned long)dst & 15))
3145         {
3146             s = *src++;
3147             d = *dst;
3148
3149             *dst++ = composite_over_8888_0565pixel (s, d);
3150             w--;
3151         }
3152
3153         /* It's a 8 pixel loop */
3154         while (w >= 8)
3155         {
3156             /* I'm loading unaligned because I'm not sure
3157              * about the address alignment.
3158              */
3159             xmm_src = load_128_unaligned ((__m128i*) src);
3160             xmm_dst = load_128_aligned ((__m128i*) dst);
3161
3162             /* Unpacking */
3163             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3164             unpack_565_128_4x128 (xmm_dst,
3165                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3166             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3167                                 &xmm_alpha_lo, &xmm_alpha_hi);
3168
3169             /* I'm loading next 4 pixels from memory
3170              * before to optimze the memory read.
3171              */
3172             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3173
3174             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3175                         &xmm_alpha_lo, &xmm_alpha_hi,
3176                         &xmm_dst0, &xmm_dst1);
3177
3178             /* Unpacking */
3179             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3180             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3181                                 &xmm_alpha_lo, &xmm_alpha_hi);
3182
3183             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3184                         &xmm_alpha_lo, &xmm_alpha_hi,
3185                         &xmm_dst2, &xmm_dst3);
3186
3187             save_128_aligned (
3188                 (__m128i*)dst, pack_565_4x128_128 (
3189                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3190
3191             w -= 8;
3192             dst += 8;
3193             src += 8;
3194         }
3195
3196         while (w--)
3197         {
3198             s = *src++;
3199             d = *dst;
3200
3201             *dst++ = composite_over_8888_0565pixel (s, d);
3202         }
3203     }
3204
3205 }
3206
3207 static void
3208 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3209                               pixman_op_t              op,
3210                               pixman_image_t *         src_image,
3211                               pixman_image_t *         mask_image,
3212                               pixman_image_t *         dst_image,
3213                               int32_t                  src_x,
3214                               int32_t                  src_y,
3215                               int32_t                  mask_x,
3216                               int32_t                  mask_y,
3217                               int32_t                  dest_x,
3218                               int32_t                  dest_y,
3219                               int32_t                  width,
3220                               int32_t                  height)
3221 {
3222     uint32_t src, srca;
3223     uint32_t *dst_line, *dst;
3224     uint8_t *mask_line, *mask;
3225     int dst_stride, mask_stride;
3226     int32_t w;
3227     uint32_t m, d;
3228
3229     __m128i xmm_src, xmm_alpha, xmm_def;
3230     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3231     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3232
3233     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3234
3235     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3236
3237     srca = src >> 24;
3238     if (src == 0)
3239         return;
3240
3241     PIXMAN_IMAGE_GET_LINE (
3242         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3243     PIXMAN_IMAGE_GET_LINE (
3244         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3245
3246     xmm_def = create_mask_2x32_128 (src, src);
3247     xmm_src = expand_pixel_32_1x128 (src);
3248     xmm_alpha = expand_alpha_1x128 (xmm_src);
3249     mmx_src   = xmm_src;
3250     mmx_alpha = xmm_alpha;
3251
3252     while (height--)
3253     {
3254         dst = dst_line;
3255         dst_line += dst_stride;
3256         mask = mask_line;
3257         mask_line += mask_stride;
3258         w = width;
3259
3260         while (w && (unsigned long)dst & 15)
3261         {
3262             uint8_t m = *mask++;
3263
3264             if (m)
3265             {
3266                 d = *dst;
3267                 mmx_mask = expand_pixel_8_1x128 (m);
3268                 mmx_dest = unpack_32_1x128 (d);
3269
3270                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3271                                                    &mmx_alpha,
3272                                                    &mmx_mask,
3273                                                    &mmx_dest));
3274             }
3275
3276             w--;
3277             dst++;
3278         }
3279
3280         while (w >= 4)
3281         {
3282             m = *((uint32_t*)mask);
3283
3284             if (srca == 0xff && m == 0xffffffff)
3285             {
3286                 save_128_aligned ((__m128i*)dst, xmm_def);
3287             }
3288             else if (m)
3289             {
3290                 xmm_dst = load_128_aligned ((__m128i*) dst);
3291                 xmm_mask = unpack_32_1x128 (m);
3292                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3293
3294                 /* Unpacking */
3295                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3296                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3297
3298                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3299                                         &xmm_mask_lo, &xmm_mask_hi);
3300
3301                 in_over_2x128 (&xmm_src, &xmm_src,
3302                                &xmm_alpha, &xmm_alpha,
3303                                &xmm_mask_lo, &xmm_mask_hi,
3304                                &xmm_dst_lo, &xmm_dst_hi);
3305
3306                 save_128_aligned (
3307                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3308             }
3309
3310             w -= 4;
3311             dst += 4;
3312             mask += 4;
3313         }
3314
3315         while (w)
3316         {
3317             uint8_t m = *mask++;
3318
3319             if (m)
3320             {
3321                 d = *dst;
3322                 mmx_mask = expand_pixel_8_1x128 (m);
3323                 mmx_dest = unpack_32_1x128 (d);
3324
3325                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3326                                                    &mmx_alpha,
3327                                                    &mmx_mask,
3328                                                    &mmx_dest));
3329             }
3330
3331             w--;
3332             dst++;
3333         }
3334     }
3335
3336 }
3337
3338 static pixman_bool_t
3339 pixman_fill_sse2 (uint32_t *bits,
3340                   int       stride,
3341                   int       bpp,
3342                   int       x,
3343                   int       y,
3344                   int       width,
3345                   int       height,
3346                   uint32_t  data)
3347 {
3348     uint32_t byte_width;
3349     uint8_t         *byte_line;
3350
3351     __m128i xmm_def;
3352
3353     if (bpp == 8)
3354     {
3355         uint8_t b;
3356         uint16_t w;
3357
3358         stride = stride * (int) sizeof (uint32_t) / 1;
3359         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3360         byte_width = width;
3361         stride *= 1;
3362
3363         b = data & 0xff;
3364         w = (b << 8) | b;
3365         data = (w << 16) | w;
3366     }
3367     else if (bpp == 16)
3368     {
3369         stride = stride * (int) sizeof (uint32_t) / 2;
3370         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3371         byte_width = 2 * width;
3372         stride *= 2;
3373
3374         data = (data & 0xffff) * 0x00010001;
3375     }
3376     else if (bpp == 32)
3377     {
3378         stride = stride * (int) sizeof (uint32_t) / 4;
3379         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3380         byte_width = 4 * width;
3381         stride *= 4;
3382     }
3383     else
3384     {
3385         return FALSE;
3386     }
3387
3388     xmm_def = create_mask_2x32_128 (data, data);
3389
3390     while (height--)
3391     {
3392         int w;
3393         uint8_t *d = byte_line;
3394         byte_line += stride;
3395         w = byte_width;
3396
3397         while (w >= 1 && ((unsigned long)d & 1))
3398         {
3399             *(uint8_t *)d = data;
3400             w -= 1;
3401             d += 1;
3402         }
3403
3404         while (w >= 2 && ((unsigned long)d & 3))
3405         {
3406             *(uint16_t *)d = data;
3407             w -= 2;
3408             d += 2;
3409         }
3410
3411         while (w >= 4 && ((unsigned long)d & 15))
3412         {
3413             *(uint32_t *)d = data;
3414
3415             w -= 4;
3416             d += 4;
3417         }
3418
3419         while (w >= 128)
3420         {
3421             save_128_aligned ((__m128i*)(d),     xmm_def);
3422             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3423             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3424             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3425             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3426             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3427             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3428             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3429
3430             d += 128;
3431             w -= 128;
3432         }
3433
3434         if (w >= 64)
3435         {
3436             save_128_aligned ((__m128i*)(d),     xmm_def);
3437             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3438             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3439             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3440
3441             d += 64;
3442             w -= 64;
3443         }
3444
3445         if (w >= 32)
3446         {
3447             save_128_aligned ((__m128i*)(d),     xmm_def);
3448             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3449
3450             d += 32;
3451             w -= 32;
3452         }
3453
3454         if (w >= 16)
3455         {
3456             save_128_aligned ((__m128i*)(d),     xmm_def);
3457
3458             d += 16;
3459             w -= 16;
3460         }
3461
3462         while (w >= 4)
3463         {
3464             *(uint32_t *)d = data;
3465
3466             w -= 4;
3467             d += 4;
3468         }
3469
3470         if (w >= 2)
3471         {
3472             *(uint16_t *)d = data;
3473             w -= 2;
3474             d += 2;
3475         }
3476
3477         if (w >= 1)
3478         {
3479             *(uint8_t *)d = data;
3480             w -= 1;
3481             d += 1;
3482         }
3483     }
3484
3485     return TRUE;
3486 }
3487
3488 static void
3489 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3490                              pixman_op_t              op,
3491                              pixman_image_t *         src_image,
3492                              pixman_image_t *         mask_image,
3493                              pixman_image_t *         dst_image,
3494                              int32_t                  src_x,
3495                              int32_t                  src_y,
3496                              int32_t                  mask_x,
3497                              int32_t                  mask_y,
3498                              int32_t                  dest_x,
3499                              int32_t                  dest_y,
3500                              int32_t                  width,
3501                              int32_t                  height)
3502 {
3503     uint32_t src, srca;
3504     uint32_t    *dst_line, *dst;
3505     uint8_t     *mask_line, *mask;
3506     int dst_stride, mask_stride;
3507     int32_t w;
3508     uint32_t m;
3509
3510     __m128i xmm_src, xmm_def;
3511     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3512
3513     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3514
3515     srca = src >> 24;
3516     if (src == 0)
3517     {
3518         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3519                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
3520                           dest_x, dest_y, width, height, 0);
3521         return;
3522     }
3523
3524     PIXMAN_IMAGE_GET_LINE (
3525         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3526     PIXMAN_IMAGE_GET_LINE (
3527         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3528
3529     xmm_def = create_mask_2x32_128 (src, src);
3530     xmm_src = expand_pixel_32_1x128 (src);
3531
3532     while (height--)
3533     {
3534         dst = dst_line;
3535         dst_line += dst_stride;
3536         mask = mask_line;
3537         mask_line += mask_stride;
3538         w = width;
3539
3540         while (w && (unsigned long)dst & 15)
3541         {
3542             uint8_t m = *mask++;
3543
3544             if (m)
3545             {
3546                 *dst = pack_1x128_32 (
3547                     pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3548             }
3549             else
3550             {
3551                 *dst = 0;
3552             }
3553
3554             w--;
3555             dst++;
3556         }
3557
3558         while (w >= 4)
3559         {
3560             m = *((uint32_t*)mask);
3561
3562             if (srca == 0xff && m == 0xffffffff)
3563             {
3564                 save_128_aligned ((__m128i*)dst, xmm_def);
3565             }
3566             else if (m)
3567             {
3568                 xmm_mask = unpack_32_1x128 (m);
3569                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3570
3571                 /* Unpacking */
3572                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3573
3574                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3575                                         &xmm_mask_lo, &xmm_mask_hi);
3576
3577                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3578                                     &xmm_mask_lo, &xmm_mask_hi,
3579                                     &xmm_mask_lo, &xmm_mask_hi);
3580
3581                 save_128_aligned (
3582                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3583             }
3584             else
3585             {
3586                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3587             }
3588
3589             w -= 4;
3590             dst += 4;
3591             mask += 4;
3592         }
3593
3594         while (w)
3595         {
3596             uint8_t m = *mask++;
3597
3598             if (m)
3599             {
3600                 *dst = pack_1x128_32 (
3601                     pix_multiply_1x128 (
3602                         xmm_src, expand_pixel_8_1x128 (m)));
3603             }
3604             else
3605             {
3606                 *dst = 0;
3607             }
3608
3609             w--;
3610             dst++;
3611         }
3612     }
3613
3614 }
3615
3616 static void
3617 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3618                               pixman_op_t              op,
3619                               pixman_image_t *         src_image,
3620                               pixman_image_t *         mask_image,
3621                               pixman_image_t *         dst_image,
3622                               int32_t                  src_x,
3623                               int32_t                  src_y,
3624                               int32_t                  mask_x,
3625                               int32_t                  mask_y,
3626                               int32_t                  dest_x,
3627                               int32_t                  dest_y,
3628                               int32_t                  width,
3629                               int32_t                  height)
3630 {
3631     uint32_t src, srca;
3632     uint16_t    *dst_line, *dst, d;
3633     uint8_t     *mask_line, *mask;
3634     int dst_stride, mask_stride;
3635     int32_t w;
3636     uint32_t m;
3637     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3638
3639     __m128i xmm_src, xmm_alpha;
3640     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3641     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3642
3643     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3644
3645     srca = src >> 24;
3646     if (src == 0)
3647         return;
3648
3649     PIXMAN_IMAGE_GET_LINE (
3650         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3651     PIXMAN_IMAGE_GET_LINE (
3652         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3653
3654     xmm_src = expand_pixel_32_1x128 (src);
3655     xmm_alpha = expand_alpha_1x128 (xmm_src);
3656     mmx_src = xmm_src;
3657     mmx_alpha = xmm_alpha;
3658
3659     while (height--)
3660     {
3661         dst = dst_line;
3662         dst_line += dst_stride;
3663         mask = mask_line;
3664         mask_line += mask_stride;
3665         w = width;
3666
3667         while (w && (unsigned long)dst & 15)
3668         {
3669             m = *mask++;
3670
3671             if (m)
3672             {
3673                 d = *dst;
3674                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3675                 mmx_dest = expand565_16_1x128 (d);
3676
3677                 *dst = pack_565_32_16 (
3678                     pack_1x128_32 (
3679                         in_over_1x128 (
3680                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3681             }
3682
3683             w--;
3684             dst++;
3685         }
3686
3687         while (w >= 8)
3688         {
3689             xmm_dst = load_128_aligned ((__m128i*) dst);
3690             unpack_565_128_4x128 (xmm_dst,
3691                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3692
3693             m = *((uint32_t*)mask);
3694             mask += 4;
3695
3696             if (m)
3697             {
3698                 xmm_mask = unpack_32_1x128 (m);
3699                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3700
3701                 /* Unpacking */
3702                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3703
3704                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3705                                         &xmm_mask_lo, &xmm_mask_hi);
3706
3707                 in_over_2x128 (&xmm_src, &xmm_src,
3708                                &xmm_alpha, &xmm_alpha,
3709                                &xmm_mask_lo, &xmm_mask_hi,
3710                                &xmm_dst0, &xmm_dst1);
3711             }
3712
3713             m = *((uint32_t*)mask);
3714             mask += 4;
3715
3716             if (m)
3717             {
3718                 xmm_mask = unpack_32_1x128 (m);
3719                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3720
3721                 /* Unpacking */
3722                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3723
3724                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3725                                         &xmm_mask_lo, &xmm_mask_hi);
3726                 in_over_2x128 (&xmm_src, &xmm_src,
3727                                &xmm_alpha, &xmm_alpha,
3728                                &xmm_mask_lo, &xmm_mask_hi,
3729                                &xmm_dst2, &xmm_dst3);
3730             }
3731
3732             save_128_aligned (
3733                 (__m128i*)dst, pack_565_4x128_128 (
3734                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3735
3736             w -= 8;
3737             dst += 8;
3738         }
3739
3740         while (w)
3741         {
3742             m = *mask++;
3743
3744             if (m)
3745             {
3746                 d = *dst;
3747                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3748                 mmx_dest = expand565_16_1x128 (d);
3749
3750                 *dst = pack_565_32_16 (
3751                     pack_1x128_32 (
3752                         in_over_1x128 (
3753                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3754             }
3755
3756             w--;
3757             dst++;
3758         }
3759     }
3760
3761 }
3762
3763 static void
3764 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3765                                  pixman_op_t              op,
3766                                  pixman_image_t *         src_image,
3767                                  pixman_image_t *         mask_image,
3768                                  pixman_image_t *         dst_image,
3769                                  int32_t                  src_x,
3770                                  int32_t                  src_y,
3771                                  int32_t                  mask_x,
3772                                  int32_t                  mask_y,
3773                                  int32_t                  dest_x,
3774                                  int32_t                  dest_y,
3775                                  int32_t                  width,
3776                                  int32_t                  height)
3777 {
3778     uint16_t    *dst_line, *dst, d;
3779     uint32_t    *src_line, *src, s;
3780     int dst_stride, src_stride;
3781     int32_t w;
3782     uint32_t opaque, zero;
3783
3784     __m128i ms;
3785     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3786     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3787
3788     PIXMAN_IMAGE_GET_LINE (
3789         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3790     PIXMAN_IMAGE_GET_LINE (
3791         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3792
3793     while (height--)
3794     {
3795         dst = dst_line;
3796         dst_line += dst_stride;
3797         src = src_line;
3798         src_line += src_stride;
3799         w = width;
3800
3801         while (w && (unsigned long)dst & 15)
3802         {
3803             s = *src++;
3804             d = *dst;
3805
3806             ms = unpack_32_1x128 (s);
3807
3808             *dst++ = pack_565_32_16 (
3809                 pack_1x128_32 (
3810                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3811             w--;
3812         }
3813
3814         while (w >= 8)
3815         {
3816             /* First round */
3817             xmm_src = load_128_unaligned ((__m128i*)src);
3818             xmm_dst = load_128_aligned  ((__m128i*)dst);
3819
3820             opaque = is_opaque (xmm_src);
3821             zero = is_zero (xmm_src);
3822
3823             unpack_565_128_4x128 (xmm_dst,
3824                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3825             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3826
3827             /* preload next round*/
3828             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3829
3830             if (opaque)
3831             {
3832                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3833                                      &xmm_dst0, &xmm_dst1);
3834             }
3835             else if (!zero)
3836             {
3837                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3838                                         &xmm_dst0, &xmm_dst1);
3839             }
3840
3841             /* Second round */
3842             opaque = is_opaque (xmm_src);
3843             zero = is_zero (xmm_src);
3844
3845             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3846
3847             if (opaque)
3848             {
3849                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3850                                      &xmm_dst2, &xmm_dst3);
3851             }
3852             else if (!zero)
3853             {
3854                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3855                                         &xmm_dst2, &xmm_dst3);
3856             }
3857
3858             save_128_aligned (
3859                 (__m128i*)dst, pack_565_4x128_128 (
3860                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3861
3862             w -= 8;
3863             src += 8;
3864             dst += 8;
3865         }
3866
3867         while (w)
3868         {
3869             s = *src++;
3870             d = *dst;
3871
3872             ms = unpack_32_1x128 (s);
3873
3874             *dst++ = pack_565_32_16 (
3875                 pack_1x128_32 (
3876                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3877             w--;
3878         }
3879     }
3880
3881 }
3882
3883 static void
3884 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3885                                  pixman_op_t              op,
3886                                  pixman_image_t *         src_image,
3887                                  pixman_image_t *         mask_image,
3888                                  pixman_image_t *         dst_image,
3889                                  int32_t                  src_x,
3890                                  int32_t                  src_y,
3891                                  int32_t                  mask_x,
3892                                  int32_t                  mask_y,
3893                                  int32_t                  dest_x,
3894                                  int32_t                  dest_y,
3895                                  int32_t                  width,
3896                                  int32_t                  height)
3897 {
3898     uint32_t    *dst_line, *dst, d;
3899     uint32_t    *src_line, *src, s;
3900     int dst_stride, src_stride;
3901     int32_t w;
3902     uint32_t opaque, zero;
3903
3904     __m128i xmm_src_lo, xmm_src_hi;
3905     __m128i xmm_dst_lo, xmm_dst_hi;
3906
3907     PIXMAN_IMAGE_GET_LINE (
3908         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3909     PIXMAN_IMAGE_GET_LINE (
3910         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3911
3912     while (height--)
3913     {
3914         dst = dst_line;
3915         dst_line += dst_stride;
3916         src = src_line;
3917         src_line += src_stride;
3918         w = width;
3919
3920         while (w && (unsigned long)dst & 15)
3921         {
3922             s = *src++;
3923             d = *dst;
3924
3925             *dst++ = pack_1x128_32 (
3926                 over_rev_non_pre_1x128 (
3927                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
3928
3929             w--;
3930         }
3931
3932         while (w >= 4)
3933         {
3934             xmm_src_hi = load_128_unaligned ((__m128i*)src);
3935
3936             opaque = is_opaque (xmm_src_hi);
3937             zero = is_zero (xmm_src_hi);
3938
3939             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3940
3941             if (opaque)
3942             {
3943                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3944                                      &xmm_dst_lo, &xmm_dst_hi);
3945
3946                 save_128_aligned (
3947                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3948             }
3949             else if (!zero)
3950             {
3951                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
3952
3953                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3954
3955                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3956                                         &xmm_dst_lo, &xmm_dst_hi);
3957
3958                 save_128_aligned (
3959                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3960             }
3961
3962             w -= 4;
3963             dst += 4;
3964             src += 4;
3965         }
3966
3967         while (w)
3968         {
3969             s = *src++;
3970             d = *dst;
3971
3972             *dst++ = pack_1x128_32 (
3973                 over_rev_non_pre_1x128 (
3974                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
3975
3976             w--;
3977         }
3978     }
3979
3980 }
3981
3982 static void
3983 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3984                                     pixman_op_t              op,
3985                                     pixman_image_t *         src_image,
3986                                     pixman_image_t *         mask_image,
3987                                     pixman_image_t *         dst_image,
3988                                     int32_t                  src_x,
3989                                     int32_t                  src_y,
3990                                     int32_t                  mask_x,
3991                                     int32_t                  mask_y,
3992                                     int32_t                  dest_x,
3993                                     int32_t                  dest_y,
3994                                     int32_t                  width,
3995                                     int32_t                  height)
3996 {
3997     uint32_t src;
3998     uint16_t    *dst_line, *dst, d;
3999     uint32_t    *mask_line, *mask, m;
4000     int dst_stride, mask_stride;
4001     int w;
4002     uint32_t pack_cmp;
4003
4004     __m128i xmm_src, xmm_alpha;
4005     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4006     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4007
4008     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4009
4010     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4011
4012     if (src == 0)
4013         return;
4014
4015     PIXMAN_IMAGE_GET_LINE (
4016         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4017     PIXMAN_IMAGE_GET_LINE (
4018         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4019
4020     xmm_src = expand_pixel_32_1x128 (src);
4021     xmm_alpha = expand_alpha_1x128 (xmm_src);
4022     mmx_src = xmm_src;
4023     mmx_alpha = xmm_alpha;
4024
4025     while (height--)
4026     {
4027         w = width;
4028         mask = mask_line;
4029         dst = dst_line;
4030         mask_line += mask_stride;
4031         dst_line += dst_stride;
4032
4033         while (w && ((unsigned long)dst & 15))
4034         {
4035             m = *(uint32_t *) mask;
4036
4037             if (m)
4038             {
4039                 d = *dst;
4040                 mmx_mask = unpack_32_1x128 (m);
4041                 mmx_dest = expand565_16_1x128 (d);
4042
4043                 *dst = pack_565_32_16 (
4044                     pack_1x128_32 (
4045                         in_over_1x128 (
4046                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4047             }
4048
4049             w--;
4050             dst++;
4051             mask++;
4052         }
4053
4054         while (w >= 8)
4055         {
4056             /* First round */
4057             xmm_mask = load_128_unaligned ((__m128i*)mask);
4058             xmm_dst = load_128_aligned ((__m128i*)dst);
4059
4060             pack_cmp = _mm_movemask_epi8 (
4061                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4062
4063             unpack_565_128_4x128 (xmm_dst,
4064                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4065             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4066
4067             /* preload next round */
4068             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4069
4070             /* preload next round */
4071             if (pack_cmp != 0xffff)
4072             {
4073                 in_over_2x128 (&xmm_src, &xmm_src,
4074                                &xmm_alpha, &xmm_alpha,
4075                                &xmm_mask_lo, &xmm_mask_hi,
4076                                &xmm_dst0, &xmm_dst1);
4077             }
4078
4079             /* Second round */
4080             pack_cmp = _mm_movemask_epi8 (
4081                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4082
4083             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4084
4085             if (pack_cmp != 0xffff)
4086             {
4087                 in_over_2x128 (&xmm_src, &xmm_src,
4088                                &xmm_alpha, &xmm_alpha,
4089                                &xmm_mask_lo, &xmm_mask_hi,
4090                                &xmm_dst2, &xmm_dst3);
4091             }
4092
4093             save_128_aligned (
4094                 (__m128i*)dst, pack_565_4x128_128 (
4095                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4096
4097             w -= 8;
4098             dst += 8;
4099             mask += 8;
4100         }
4101
4102         while (w)
4103         {
4104             m = *(uint32_t *) mask;
4105
4106             if (m)
4107             {
4108                 d = *dst;
4109                 mmx_mask = unpack_32_1x128 (m);
4110                 mmx_dest = expand565_16_1x128 (d);
4111
4112                 *dst = pack_565_32_16 (
4113                     pack_1x128_32 (
4114                         in_over_1x128 (
4115                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4116             }
4117
4118             w--;
4119             dst++;
4120             mask++;
4121         }
4122     }
4123
4124 }
4125
4126 static void
4127 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4128                          pixman_op_t              op,
4129                          pixman_image_t *         src_image,
4130                          pixman_image_t *         mask_image,
4131                          pixman_image_t *         dst_image,
4132                          int32_t                  src_x,
4133                          int32_t                  src_y,
4134                          int32_t                  mask_x,
4135                          int32_t                  mask_y,
4136                          int32_t                  dest_x,
4137                          int32_t                  dest_y,
4138                          int32_t                  width,
4139                          int32_t                  height)
4140 {
4141     uint8_t     *dst_line, *dst;
4142     uint8_t     *mask_line, *mask;
4143     int dst_stride, mask_stride;
4144     uint32_t d, m;
4145     uint32_t src;
4146     uint8_t sa;
4147     int32_t w;
4148
4149     __m128i xmm_alpha;
4150     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4151     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4152
4153     PIXMAN_IMAGE_GET_LINE (
4154         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4155     PIXMAN_IMAGE_GET_LINE (
4156         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4157
4158     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4159
4160     sa = src >> 24;
4161
4162     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4163
4164     while (height--)
4165     {
4166         dst = dst_line;
4167         dst_line += dst_stride;
4168         mask = mask_line;
4169         mask_line += mask_stride;
4170         w = width;
4171
4172         while (w && ((unsigned long)dst & 15))
4173         {
4174             m = (uint32_t) *mask++;
4175             d = (uint32_t) *dst;
4176
4177             *dst++ = (uint8_t) pack_1x128_32 (
4178                 pix_multiply_1x128 (
4179                     pix_multiply_1x128 (xmm_alpha,
4180                                        unpack_32_1x128 (m)),
4181                     unpack_32_1x128 (d)));
4182             w--;
4183         }
4184
4185         while (w >= 16)
4186         {
4187             xmm_mask = load_128_unaligned ((__m128i*)mask);
4188             xmm_dst = load_128_aligned ((__m128i*)dst);
4189
4190             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4191             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4192
4193             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4194                                 &xmm_mask_lo, &xmm_mask_hi,
4195                                 &xmm_mask_lo, &xmm_mask_hi);
4196
4197             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4198                                 &xmm_dst_lo, &xmm_dst_hi,
4199                                 &xmm_dst_lo, &xmm_dst_hi);
4200
4201             save_128_aligned (
4202                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4203
4204             mask += 16;
4205             dst += 16;
4206             w -= 16;
4207         }
4208
4209         while (w)
4210         {
4211             m = (uint32_t) *mask++;
4212             d = (uint32_t) *dst;
4213
4214             *dst++ = (uint8_t) pack_1x128_32 (
4215                 pix_multiply_1x128 (
4216                     pix_multiply_1x128 (
4217                         xmm_alpha, unpack_32_1x128 (m)),
4218                     unpack_32_1x128 (d)));
4219             w--;
4220         }
4221     }
4222
4223 }
4224
4225 static void
4226 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4227                        pixman_op_t              op,
4228                        pixman_image_t *         src_image,
4229                        pixman_image_t *         mask_image,
4230                        pixman_image_t *         dst_image,
4231                        int32_t                  src_x,
4232                        int32_t                  src_y,
4233                        int32_t                  mask_x,
4234                        int32_t                  mask_y,
4235                        int32_t                  dest_x,
4236                        int32_t                  dest_y,
4237                        int32_t                  width,
4238                        int32_t                  height)
4239 {
4240     uint8_t     *dst_line, *dst;
4241     int dst_stride;
4242     uint32_t d;
4243     uint32_t src;
4244     int32_t w;
4245
4246     __m128i xmm_alpha;
4247     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4248
4249     PIXMAN_IMAGE_GET_LINE (
4250         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4251
4252     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4253
4254     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4255
4256     src = src >> 24;
4257
4258     if (src == 0xff)
4259         return;
4260
4261     if (src == 0x00)
4262     {
4263         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4264                      8, dest_x, dest_y, width, height, src);
4265
4266         return;
4267     }
4268
4269     while (height--)
4270     {
4271         dst = dst_line;
4272         dst_line += dst_stride;
4273         w = width;
4274
4275         while (w && ((unsigned long)dst & 15))
4276         {
4277             d = (uint32_t) *dst;
4278
4279             *dst++ = (uint8_t) pack_1x128_32 (
4280                 pix_multiply_1x128 (
4281                     xmm_alpha,
4282                     unpack_32_1x128 (d)));
4283             w--;
4284         }
4285
4286         while (w >= 16)
4287         {
4288             xmm_dst = load_128_aligned ((__m128i*)dst);
4289
4290             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4291             
4292             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4293                                 &xmm_dst_lo, &xmm_dst_hi,
4294                                 &xmm_dst_lo, &xmm_dst_hi);
4295
4296             save_128_aligned (
4297                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4298
4299             dst += 16;
4300             w -= 16;
4301         }
4302
4303         while (w)
4304         {
4305             d = (uint32_t) *dst;
4306
4307             *dst++ = (uint8_t) pack_1x128_32 (
4308                 pix_multiply_1x128 (
4309                     xmm_alpha,
4310                     unpack_32_1x128 (d)));
4311             w--;
4312         }
4313     }
4314
4315 }
4316
4317 static void
4318 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4319                        pixman_op_t              op,
4320                        pixman_image_t *         src_image,
4321                        pixman_image_t *         mask_image,
4322                        pixman_image_t *         dst_image,
4323                        int32_t                  src_x,
4324                        int32_t                  src_y,
4325                        int32_t                  mask_x,
4326                        int32_t                  mask_y,
4327                        int32_t                  dest_x,
4328                        int32_t                  dest_y,
4329                        int32_t                  width,
4330                        int32_t                  height)
4331 {
4332     uint8_t     *dst_line, *dst;
4333     uint8_t     *src_line, *src;
4334     int src_stride, dst_stride;
4335     int32_t w;
4336     uint32_t s, d;
4337
4338     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4339     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4340
4341     PIXMAN_IMAGE_GET_LINE (
4342         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4343     PIXMAN_IMAGE_GET_LINE (
4344         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4345
4346     while (height--)
4347     {
4348         dst = dst_line;
4349         dst_line += dst_stride;
4350         src = src_line;
4351         src_line += src_stride;
4352         w = width;
4353
4354         while (w && ((unsigned long)dst & 15))
4355         {
4356             s = (uint32_t) *src++;
4357             d = (uint32_t) *dst;
4358
4359             *dst++ = (uint8_t) pack_1x128_32 (
4360                 pix_multiply_1x128 (
4361                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4362             w--;
4363         }
4364
4365         while (w >= 16)
4366         {
4367             xmm_src = load_128_unaligned ((__m128i*)src);
4368             xmm_dst = load_128_aligned ((__m128i*)dst);
4369
4370             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4371             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4372
4373             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4374                                 &xmm_dst_lo, &xmm_dst_hi,
4375                                 &xmm_dst_lo, &xmm_dst_hi);
4376
4377             save_128_aligned (
4378                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4379
4380             src += 16;
4381             dst += 16;
4382             w -= 16;
4383         }
4384
4385         while (w)
4386         {
4387             s = (uint32_t) *src++;
4388             d = (uint32_t) *dst;
4389
4390             *dst++ = (uint8_t) pack_1x128_32 (
4391                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4392             w--;
4393         }
4394     }
4395
4396 }
4397
4398 static void
4399 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4400                           pixman_op_t              op,
4401                           pixman_image_t *         src_image,
4402                           pixman_image_t *         mask_image,
4403                           pixman_image_t *         dst_image,
4404                           int32_t                  src_x,
4405                           int32_t                  src_y,
4406                           int32_t                  mask_x,
4407                           int32_t                  mask_y,
4408                           int32_t                  dest_x,
4409                           int32_t                  dest_y,
4410                           int32_t                  width,
4411                           int32_t                  height)
4412 {
4413     uint8_t     *dst_line, *dst;
4414     uint8_t     *mask_line, *mask;
4415     int dst_stride, mask_stride;
4416     int32_t w;
4417     uint32_t src;
4418     uint8_t sa;
4419     uint32_t m, d;
4420
4421     __m128i xmm_alpha;
4422     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4423     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4424
4425     PIXMAN_IMAGE_GET_LINE (
4426         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4427     PIXMAN_IMAGE_GET_LINE (
4428         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4429
4430     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4431
4432     sa = src >> 24;
4433
4434     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4435
4436     while (height--)
4437     {
4438         dst = dst_line;
4439         dst_line += dst_stride;
4440         mask = mask_line;
4441         mask_line += mask_stride;
4442         w = width;
4443
4444         while (w && ((unsigned long)dst & 15))
4445         {
4446             m = (uint32_t) *mask++;
4447             d = (uint32_t) *dst;
4448
4449             *dst++ = (uint8_t) pack_1x128_32 (
4450                 _mm_adds_epu16 (
4451                     pix_multiply_1x128 (
4452                         xmm_alpha, unpack_32_1x128 (m)),
4453                     unpack_32_1x128 (d)));
4454             w--;
4455         }
4456
4457         while (w >= 16)
4458         {
4459             xmm_mask = load_128_unaligned ((__m128i*)mask);
4460             xmm_dst = load_128_aligned ((__m128i*)dst);
4461
4462             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4463             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4464
4465             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4466                                 &xmm_mask_lo, &xmm_mask_hi,
4467                                 &xmm_mask_lo, &xmm_mask_hi);
4468
4469             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4470             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4471
4472             save_128_aligned (
4473                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4474
4475             mask += 16;
4476             dst += 16;
4477             w -= 16;
4478         }
4479
4480         while (w)
4481         {
4482             m = (uint32_t) *mask++;
4483             d = (uint32_t) *dst;
4484
4485             *dst++ = (uint8_t) pack_1x128_32 (
4486                 _mm_adds_epu16 (
4487                     pix_multiply_1x128 (
4488                         xmm_alpha, unpack_32_1x128 (m)),
4489                     unpack_32_1x128 (d)));
4490
4491             w--;
4492         }
4493     }
4494
4495 }
4496
4497 static void
4498 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4499                         pixman_op_t              op,
4500                         pixman_image_t *         src_image,
4501                         pixman_image_t *         mask_image,
4502                         pixman_image_t *         dst_image,
4503                         int32_t                  src_x,
4504                         int32_t                  src_y,
4505                         int32_t                  mask_x,
4506                         int32_t                  mask_y,
4507                         int32_t                  dest_x,
4508                         int32_t                  dest_y,
4509                         int32_t                  width,
4510                         int32_t                  height)
4511 {
4512     uint8_t     *dst_line, *dst;
4513     int dst_stride;
4514     int32_t w;
4515     uint32_t src;
4516
4517     __m128i xmm_src;
4518
4519     PIXMAN_IMAGE_GET_LINE (
4520         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4521
4522     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4523
4524     src >>= 24;
4525
4526     if (src == 0x00)
4527         return;
4528
4529     if (src == 0xff)
4530     {
4531         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4532                      8, dest_x, dest_y, width, height, 0xff);
4533
4534         return;
4535     }
4536
4537     src = (src << 24) | (src << 16) | (src << 8) | src;
4538     xmm_src = _mm_set_epi32 (src, src, src, src);
4539
4540     while (height--)
4541     {
4542         dst = dst_line;
4543         dst_line += dst_stride;
4544         w = width;
4545
4546         while (w && ((unsigned long)dst & 15))
4547         {
4548             *dst = (uint8_t)_mm_cvtsi128_si32 (
4549                 _mm_adds_epu8 (
4550                     xmm_src,
4551                     _mm_cvtsi32_si128 (*dst)));
4552
4553             w--;
4554             dst++;
4555         }
4556
4557         while (w >= 16)
4558         {
4559             save_128_aligned (
4560                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4561
4562             dst += 16;
4563             w -= 16;
4564         }
4565
4566         while (w)
4567         {
4568             *dst = (uint8_t)_mm_cvtsi128_si32 (
4569                 _mm_adds_epu8 (
4570                     xmm_src,
4571                     _mm_cvtsi32_si128 (*dst)));
4572
4573             w--;
4574             dst++;
4575         }
4576     }
4577
4578 }
4579
4580 static void
4581 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4582                         pixman_op_t              op,
4583                         pixman_image_t *         src_image,
4584                         pixman_image_t *         mask_image,
4585                         pixman_image_t *         dst_image,
4586                         int32_t                  src_x,
4587                         int32_t                  src_y,
4588                         int32_t                  mask_x,
4589                         int32_t                  mask_y,
4590                         int32_t                  dest_x,
4591                         int32_t                  dest_y,
4592                         int32_t                  width,
4593                         int32_t                  height)
4594 {
4595     uint8_t     *dst_line, *dst;
4596     uint8_t     *src_line, *src;
4597     int dst_stride, src_stride;
4598     int32_t w;
4599     uint16_t t;
4600
4601     PIXMAN_IMAGE_GET_LINE (
4602         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4603     PIXMAN_IMAGE_GET_LINE (
4604         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4605
4606     while (height--)
4607     {
4608         dst = dst_line;
4609         src = src_line;
4610
4611         dst_line += dst_stride;
4612         src_line += src_stride;
4613         w = width;
4614
4615         /* Small head */
4616         while (w && (unsigned long)dst & 3)
4617         {
4618             t = (*dst) + (*src++);
4619             *dst++ = t | (0 - (t >> 8));
4620             w--;
4621         }
4622
4623         sse2_combine_add_u (imp, op,
4624                             (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4625
4626         /* Small tail */
4627         dst += w & 0xfffc;
4628         src += w & 0xfffc;
4629
4630         w &= 3;
4631
4632         while (w)
4633         {
4634             t = (*dst) + (*src++);
4635             *dst++ = t | (0 - (t >> 8));
4636             w--;
4637         }
4638     }
4639
4640 }
4641
4642 static void
4643 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4644                               pixman_op_t              op,
4645                               pixman_image_t *         src_image,
4646                               pixman_image_t *         mask_image,
4647                               pixman_image_t *         dst_image,
4648                               int32_t                  src_x,
4649                               int32_t                  src_y,
4650                               int32_t                  mask_x,
4651                               int32_t                  mask_y,
4652                               int32_t                  dest_x,
4653                               int32_t                  dest_y,
4654                               int32_t                  width,
4655                               int32_t                  height)
4656 {
4657     uint32_t    *dst_line, *dst;
4658     uint32_t    *src_line, *src;
4659     int dst_stride, src_stride;
4660
4661     PIXMAN_IMAGE_GET_LINE (
4662         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4663     PIXMAN_IMAGE_GET_LINE (
4664         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4665
4666     while (height--)
4667     {
4668         dst = dst_line;
4669         dst_line += dst_stride;
4670         src = src_line;
4671         src_line += src_stride;
4672
4673         sse2_combine_add_u (imp, op, dst, src, NULL, width);
4674     }
4675
4676 }
4677
4678 static pixman_bool_t
4679 pixman_blt_sse2 (uint32_t *src_bits,
4680                  uint32_t *dst_bits,
4681                  int       src_stride,
4682                  int       dst_stride,
4683                  int       src_bpp,
4684                  int       dst_bpp,
4685                  int       src_x,
4686                  int       src_y,
4687                  int       dst_x,
4688                  int       dst_y,
4689                  int       width,
4690                  int       height)
4691 {
4692     uint8_t *   src_bytes;
4693     uint8_t *   dst_bytes;
4694     int byte_width;
4695
4696     if (src_bpp != dst_bpp)
4697         return FALSE;
4698
4699     if (src_bpp == 16)
4700     {
4701         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4702         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4703         src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4704         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4705         byte_width = 2 * width;
4706         src_stride *= 2;
4707         dst_stride *= 2;
4708     }
4709     else if (src_bpp == 32)
4710     {
4711         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4712         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4713         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4714         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4715         byte_width = 4 * width;
4716         src_stride *= 4;
4717         dst_stride *= 4;
4718     }
4719     else
4720     {
4721         return FALSE;
4722     }
4723
4724     while (height--)
4725     {
4726         int w;
4727         uint8_t *s = src_bytes;
4728         uint8_t *d = dst_bytes;
4729         src_bytes += src_stride;
4730         dst_bytes += dst_stride;
4731         w = byte_width;
4732
4733         while (w >= 2 && ((unsigned long)d & 3))
4734         {
4735             *(uint16_t *)d = *(uint16_t *)s;
4736             w -= 2;
4737             s += 2;
4738             d += 2;
4739         }
4740
4741         while (w >= 4 && ((unsigned long)d & 15))
4742         {
4743             *(uint32_t *)d = *(uint32_t *)s;
4744
4745             w -= 4;
4746             s += 4;
4747             d += 4;
4748         }
4749
4750         while (w >= 64)
4751         {
4752             __m128i xmm0, xmm1, xmm2, xmm3;
4753
4754             xmm0 = load_128_unaligned ((__m128i*)(s));
4755             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4756             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4757             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4758
4759             save_128_aligned ((__m128i*)(d),    xmm0);
4760             save_128_aligned ((__m128i*)(d + 16), xmm1);
4761             save_128_aligned ((__m128i*)(d + 32), xmm2);
4762             save_128_aligned ((__m128i*)(d + 48), xmm3);
4763
4764             s += 64;
4765             d += 64;
4766             w -= 64;
4767         }
4768
4769         while (w >= 16)
4770         {
4771             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4772
4773             w -= 16;
4774             d += 16;
4775             s += 16;
4776         }
4777
4778         while (w >= 4)
4779         {
4780             *(uint32_t *)d = *(uint32_t *)s;
4781
4782             w -= 4;
4783             s += 4;
4784             d += 4;
4785         }
4786
4787         if (w >= 2)
4788         {
4789             *(uint16_t *)d = *(uint16_t *)s;
4790             w -= 2;
4791             s += 2;
4792             d += 2;
4793         }
4794     }
4795
4796
4797     return TRUE;
4798 }
4799
4800 static void
4801 sse2_composite_copy_area (pixman_implementation_t *imp,
4802                           pixman_op_t              op,
4803                           pixman_image_t *         src_image,
4804                           pixman_image_t *         mask_image,
4805                           pixman_image_t *         dst_image,
4806                           int32_t                  src_x,
4807                           int32_t                  src_y,
4808                           int32_t                  mask_x,
4809                           int32_t                  mask_y,
4810                           int32_t                  dest_x,
4811                           int32_t                  dest_y,
4812                           int32_t                  width,
4813                           int32_t                  height)
4814 {
4815     pixman_blt_sse2 (src_image->bits.bits,
4816                      dst_image->bits.bits,
4817                      src_image->bits.rowstride,
4818                      dst_image->bits.rowstride,
4819                      PIXMAN_FORMAT_BPP (src_image->bits.format),
4820                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
4821                      src_x, src_y, dest_x, dest_y, width, height);
4822 }
4823
4824 static void
4825 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4826                                  pixman_op_t              op,
4827                                  pixman_image_t *         src_image,
4828                                  pixman_image_t *         mask_image,
4829                                  pixman_image_t *         dst_image,
4830                                  int32_t                  src_x,
4831                                  int32_t                  src_y,
4832                                  int32_t                  mask_x,
4833                                  int32_t                  mask_y,
4834                                  int32_t                  dest_x,
4835                                  int32_t                  dest_y,
4836                                  int32_t                  width,
4837                                  int32_t                  height)
4838 {
4839     uint32_t    *src, *src_line, s;
4840     uint32_t    *dst, *dst_line, d;
4841     uint8_t         *mask, *mask_line;
4842     uint32_t m;
4843     int src_stride, mask_stride, dst_stride;
4844     int32_t w;
4845     __m128i ms;
4846
4847     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4848     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4849     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4850
4851     PIXMAN_IMAGE_GET_LINE (
4852         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4853     PIXMAN_IMAGE_GET_LINE (
4854         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4855     PIXMAN_IMAGE_GET_LINE (
4856         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4857
4858     while (height--)
4859     {
4860         src = src_line;
4861         src_line += src_stride;
4862         dst = dst_line;
4863         dst_line += dst_stride;
4864         mask = mask_line;
4865         mask_line += mask_stride;
4866
4867         w = width;
4868
4869         while (w && (unsigned long)dst & 15)
4870         {
4871             s = 0xff000000 | *src++;
4872             m = (uint32_t) *mask++;
4873             d = *dst;
4874             ms = unpack_32_1x128 (s);
4875
4876             if (m != 0xff)
4877             {
4878                 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4879                 __m128i md = unpack_32_1x128 (d);
4880
4881                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4882             }
4883
4884             *dst++ = pack_1x128_32 (ms);
4885             w--;
4886         }
4887
4888         while (w >= 4)
4889         {
4890             m = *(uint32_t*) mask;
4891             xmm_src = _mm_or_si128 (
4892                 load_128_unaligned ((__m128i*)src), mask_ff000000);
4893
4894             if (m == 0xffffffff)
4895             {
4896                 save_128_aligned ((__m128i*)dst, xmm_src);
4897             }
4898             else
4899             {
4900                 xmm_dst = load_128_aligned ((__m128i*)dst);
4901
4902                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4903
4904                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4905                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4906                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4907
4908                 expand_alpha_rev_2x128 (
4909                     xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4910
4911                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4912                                &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4913                                &xmm_dst_lo, &xmm_dst_hi);
4914
4915                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4916             }
4917
4918             src += 4;
4919             dst += 4;
4920             mask += 4;
4921             w -= 4;
4922         }
4923
4924         while (w)
4925         {
4926             m = (uint32_t) *mask++;
4927
4928             if (m)
4929             {
4930                 s = 0xff000000 | *src;
4931
4932                 if (m == 0xff)
4933                 {
4934                     *dst = s;
4935                 }
4936                 else
4937                 {
4938                     __m128i ma, md, ms;
4939
4940                     d = *dst;
4941
4942                     ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4943                     md = unpack_32_1x128 (d);
4944                     ms = unpack_32_1x128 (s);
4945
4946                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4947                 }
4948
4949             }
4950
4951             src++;
4952             dst++;
4953             w--;
4954         }
4955     }
4956
4957 }
4958
4959 static void
4960 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4961                                  pixman_op_t              op,
4962                                  pixman_image_t *         src_image,
4963                                  pixman_image_t *         mask_image,
4964                                  pixman_image_t *         dst_image,
4965                                  int32_t                  src_x,
4966                                  int32_t                  src_y,
4967                                  int32_t                  mask_x,
4968                                  int32_t                  mask_y,
4969                                  int32_t                  dest_x,
4970                                  int32_t                  dest_y,
4971                                  int32_t                  width,
4972                                  int32_t                  height)
4973 {
4974     uint32_t    *src, *src_line, s;
4975     uint32_t    *dst, *dst_line, d;
4976     uint8_t         *mask, *mask_line;
4977     uint32_t m;
4978     int src_stride, mask_stride, dst_stride;
4979     int32_t w;
4980
4981     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4982     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4983     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4984
4985     PIXMAN_IMAGE_GET_LINE (
4986         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4987     PIXMAN_IMAGE_GET_LINE (
4988         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4989     PIXMAN_IMAGE_GET_LINE (
4990         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4991
4992     while (height--)
4993     {
4994         src = src_line;
4995         src_line += src_stride;
4996         dst = dst_line;
4997         dst_line += dst_stride;
4998         mask = mask_line;
4999         mask_line += mask_stride;
5000
5001         w = width;
5002
5003         while (w && (unsigned long)dst & 15)
5004         {
5005             uint32_t sa;
5006
5007             s = *src++;
5008             m = (uint32_t) *mask++;
5009             d = *dst;
5010
5011             sa = s >> 24;
5012
5013             if (m)
5014             {
5015                 if (sa == 0xff && m == 0xff)
5016                 {
5017                     *dst = s;
5018                 }
5019                 else
5020                 {
5021                     __m128i ms, md, ma, msa;
5022
5023                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5024                     ms = unpack_32_1x128 (s);
5025                     md = unpack_32_1x128 (d);
5026
5027                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5028
5029                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5030                 }
5031             }
5032
5033             dst++;
5034             w--;
5035         }
5036
5037         while (w >= 4)
5038         {
5039             m = *(uint32_t *) mask;
5040
5041             if (m)
5042             {
5043                 xmm_src = load_128_unaligned ((__m128i*)src);
5044
5045                 if (m == 0xffffffff && is_opaque (xmm_src))
5046                 {
5047                     save_128_aligned ((__m128i *)dst, xmm_src);
5048                 }
5049                 else
5050                 {
5051                     xmm_dst = load_128_aligned ((__m128i *)dst);
5052
5053                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5054
5055                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5056                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5057                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5058
5059                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5060                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5061
5062                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5063                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5064
5065                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5066                 }
5067             }
5068
5069             src += 4;
5070             dst += 4;
5071             mask += 4;
5072             w -= 4;
5073         }
5074
5075         while (w)
5076         {
5077             uint32_t sa;
5078
5079             s = *src++;
5080             m = (uint32_t) *mask++;
5081             d = *dst;
5082
5083             sa = s >> 24;
5084
5085             if (m)
5086             {
5087                 if (sa == 0xff && m == 0xff)
5088                 {
5089                     *dst = s;
5090                 }
5091                 else
5092                 {
5093                     __m128i ms, md, ma, msa;
5094
5095                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5096                     ms = unpack_32_1x128 (s);
5097                     md = unpack_32_1x128 (d);
5098
5099                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5100
5101                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5102                 }
5103             }
5104
5105             dst++;
5106             w--;
5107         }
5108     }
5109
5110 }
5111
5112 static void
5113 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5114                                     pixman_op_t              op,
5115                                     pixman_image_t *         src_image,
5116                                     pixman_image_t *         mask_image,
5117                                     pixman_image_t *         dst_image,
5118                                     int32_t                  src_x,
5119                                     int32_t                  src_y,
5120                                     int32_t                  mask_x,
5121                                     int32_t                  mask_y,
5122                                     int32_t                  dest_x,
5123                                     int32_t                  dest_y,
5124                                     int32_t                  width,
5125                                     int32_t                  height)
5126 {
5127     uint32_t src;
5128     uint32_t    *dst_line, *dst;
5129     __m128i xmm_src;
5130     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5131     __m128i xmm_dsta_hi, xmm_dsta_lo;
5132     int dst_stride;
5133     int32_t w;
5134
5135     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5136
5137     if (src == 0)
5138         return;
5139
5140     PIXMAN_IMAGE_GET_LINE (
5141         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5142
5143     xmm_src = expand_pixel_32_1x128 (src);
5144
5145     while (height--)
5146     {
5147         dst = dst_line;
5148
5149         dst_line += dst_stride;
5150         w = width;
5151
5152         while (w && (unsigned long)dst & 15)
5153         {
5154             __m128i vd;
5155
5156             vd = unpack_32_1x128 (*dst);
5157
5158             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5159                                               xmm_src));
5160             w--;
5161             dst++;
5162         }
5163
5164         while (w >= 4)
5165         {
5166             __m128i tmp_lo, tmp_hi;
5167
5168             xmm_dst = load_128_aligned ((__m128i*)dst);
5169
5170             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5171             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5172
5173             tmp_lo = xmm_src;
5174             tmp_hi = xmm_src;
5175
5176             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5177                         &xmm_dsta_lo, &xmm_dsta_hi,
5178                         &tmp_lo, &tmp_hi);
5179
5180             save_128_aligned (
5181                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5182
5183             w -= 4;
5184             dst += 4;
5185         }
5186
5187         while (w)
5188         {
5189             __m128i vd;
5190
5191             vd = unpack_32_1x128 (*dst);
5192
5193             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5194                                               xmm_src));
5195             w--;
5196             dst++;
5197         }
5198
5199     }
5200
5201 }
5202
5203 static void
5204 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5205                                     pixman_op_t              op,
5206                                     pixman_image_t *         src_image,
5207                                     pixman_image_t *         mask_image,
5208                                     pixman_image_t *         dst_image,
5209                                     int32_t                  src_x,
5210                                     int32_t                  src_y,
5211                                     int32_t                  mask_x,
5212                                     int32_t                  mask_y,
5213                                     int32_t                  dest_x,
5214                                     int32_t                  dest_y,
5215                                     int32_t                  width,
5216                                     int32_t                  height)
5217 {
5218     uint32_t    *src, *src_line, s;
5219     uint32_t    *dst, *dst_line, d;
5220     uint32_t    *mask, *mask_line;
5221     uint32_t    m;
5222     int src_stride, mask_stride, dst_stride;
5223     int32_t w;
5224
5225     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5226     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5227     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5228
5229     PIXMAN_IMAGE_GET_LINE (
5230         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5231     PIXMAN_IMAGE_GET_LINE (
5232         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5233     PIXMAN_IMAGE_GET_LINE (
5234         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5235
5236     while (height--)
5237     {
5238         src = src_line;
5239         src_line += src_stride;
5240         dst = dst_line;
5241         dst_line += dst_stride;
5242         mask = mask_line;
5243         mask_line += mask_stride;
5244
5245         w = width;
5246
5247         while (w && (unsigned long)dst & 15)
5248         {
5249             uint32_t sa;
5250
5251             s = *src++;
5252             m = (*mask++) >> 24;
5253             d = *dst;
5254
5255             sa = s >> 24;
5256
5257             if (m)
5258             {
5259                 if (sa == 0xff && m == 0xff)
5260                 {
5261                     *dst = s;
5262                 }
5263                 else
5264                 {
5265                     __m128i ms, md, ma, msa;
5266
5267                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5268                     ms = unpack_32_1x128 (s);
5269                     md = unpack_32_1x128 (d);
5270
5271                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5272
5273                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5274                 }
5275             }
5276
5277             dst++;
5278             w--;
5279         }
5280
5281         while (w >= 4)
5282         {
5283             xmm_mask = load_128_unaligned ((__m128i*)mask);
5284
5285             if (!is_transparent (xmm_mask))
5286             {
5287                 xmm_src = load_128_unaligned ((__m128i*)src);
5288
5289                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5290                 {
5291                     save_128_aligned ((__m128i *)dst, xmm_src);
5292                 }
5293                 else
5294                 {
5295                     xmm_dst = load_128_aligned ((__m128i *)dst);
5296
5297                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5298                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5299                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5300
5301                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5302                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5303
5304                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5305                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5306
5307                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5308                 }
5309             }
5310
5311             src += 4;
5312             dst += 4;
5313             mask += 4;
5314             w -= 4;
5315         }
5316
5317         while (w)
5318         {
5319             uint32_t sa;
5320
5321             s = *src++;
5322             m = (*mask++) >> 24;
5323             d = *dst;
5324
5325             sa = s >> 24;
5326
5327             if (m)
5328             {
5329                 if (sa == 0xff && m == 0xff)
5330                 {
5331                     *dst = s;
5332                 }
5333                 else
5334                 {
5335                     __m128i ms, md, ma, msa;
5336
5337                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5338                     ms = unpack_32_1x128 (s);
5339                     md = unpack_32_1x128 (d);
5340
5341                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5342
5343                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5344                 }
5345             }
5346
5347             dst++;
5348             w--;
5349         }
5350     }
5351
5352 }
5353
5354 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5355 static force_inline void
5356 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5357                                              const uint32_t* ps,
5358                                              int32_t         w,
5359                                              pixman_fixed_t  vx,
5360                                              pixman_fixed_t  unit_x,
5361                                              pixman_fixed_t  max_vx,
5362                                              pixman_bool_t   fully_transparent_src)
5363 {
5364     uint32_t s, d;
5365     const uint32_t* pm = NULL;
5366
5367     __m128i xmm_dst_lo, xmm_dst_hi;
5368     __m128i xmm_src_lo, xmm_src_hi;
5369     __m128i xmm_alpha_lo, xmm_alpha_hi;
5370
5371     if (fully_transparent_src)
5372         return;
5373
5374     /* Align dst on a 16-byte boundary */
5375     while (w && ((unsigned long)pd & 15))
5376     {
5377         d = *pd;
5378         s = combine1 (ps + (vx >> 16), pm);
5379         vx += unit_x;
5380
5381         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5382         if (pm)
5383             pm++;
5384         w--;
5385     }
5386
5387     while (w >= 4)
5388     {
5389         __m128i tmp;
5390         uint32_t tmp1, tmp2, tmp3, tmp4;
5391
5392         tmp1 = ps[vx >> 16];
5393         vx += unit_x;
5394         tmp2 = ps[vx >> 16];
5395         vx += unit_x;
5396         tmp3 = ps[vx >> 16];
5397         vx += unit_x;
5398         tmp4 = ps[vx >> 16];
5399         vx += unit_x;
5400
5401         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5402
5403         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5404
5405         if (is_opaque (xmm_src_hi))
5406         {
5407             save_128_aligned ((__m128i*)pd, xmm_src_hi);
5408         }
5409         else if (!is_zero (xmm_src_hi))
5410         {
5411             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5412
5413             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5414             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5415
5416             expand_alpha_2x128 (
5417                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5418
5419             over_2x128 (&xmm_src_lo, &xmm_src_hi,
5420                         &xmm_alpha_lo, &xmm_alpha_hi,
5421                         &xmm_dst_lo, &xmm_dst_hi);
5422
5423             /* rebuid the 4 pixel data and save*/
5424             save_128_aligned ((__m128i*)pd,
5425                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5426         }
5427
5428         w -= 4;
5429         pd += 4;
5430         if (pm)
5431             pm += 4;
5432     }
5433
5434     while (w)
5435     {
5436         d = *pd;
5437         s = combine1 (ps + (vx >> 16), pm);
5438         vx += unit_x;
5439
5440         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5441         if (pm)
5442             pm++;
5443
5444         w--;
5445     }
5446 }
5447
5448 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5449                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5450                        uint32_t, uint32_t, COVER)
5451 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5452                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5453                        uint32_t, uint32_t, NONE)
5454 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5455                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5456                        uint32_t, uint32_t, PAD)
5457
5458 static force_inline void
5459 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5460                                                uint32_t *       dst,
5461                                                const uint32_t * src,
5462                                                int32_t          w,
5463                                                pixman_fixed_t   vx,
5464                                                pixman_fixed_t   unit_x,
5465                                                pixman_fixed_t   max_vx,
5466                                                pixman_bool_t    zero_src)
5467 {
5468     __m128i xmm_mask;
5469     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5470     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5471     __m128i xmm_alpha_lo, xmm_alpha_hi;
5472
5473     if (zero_src || (*mask >> 24) == 0)
5474         return;
5475
5476     xmm_mask = create_mask_16_128 (*mask >> 24);
5477
5478     while (w && (unsigned long)dst & 15)
5479     {
5480         uint32_t s = src[pixman_fixed_to_int (vx)];
5481         vx += unit_x;
5482
5483         if (s)
5484         {
5485             uint32_t d = *dst;
5486
5487             __m128i ms = unpack_32_1x128 (s);
5488             __m128i alpha     = expand_alpha_1x128 (ms);
5489             __m128i dest      = xmm_mask;
5490             __m128i alpha_dst = unpack_32_1x128 (d);
5491
5492             *dst = pack_1x128_32 (
5493                 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5494         }
5495         dst++;
5496         w--;
5497     }
5498
5499     while (w >= 4)
5500     {
5501         uint32_t tmp1, tmp2, tmp3, tmp4;
5502
5503         tmp1 = src[pixman_fixed_to_int (vx)];
5504         vx += unit_x;
5505         tmp2 = src[pixman_fixed_to_int (vx)];
5506         vx += unit_x;
5507         tmp3 = src[pixman_fixed_to_int (vx)];
5508         vx += unit_x;
5509         tmp4 = src[pixman_fixed_to_int (vx)];
5510         vx += unit_x;
5511
5512         xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5513
5514         if (!is_zero (xmm_src))
5515         {
5516             xmm_dst = load_128_aligned ((__m128i*)dst);
5517
5518             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5519             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5520             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5521                                 &xmm_alpha_lo, &xmm_alpha_hi);
5522
5523             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5524                            &xmm_alpha_lo, &xmm_alpha_hi,
5525                            &xmm_mask, &xmm_mask,
5526                            &xmm_dst_lo, &xmm_dst_hi);
5527
5528             save_128_aligned (
5529                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5530         }
5531
5532         dst += 4;
5533         w -= 4;
5534     }
5535
5536     while (w)
5537     {
5538         uint32_t s = src[pixman_fixed_to_int (vx)];
5539         vx += unit_x;
5540
5541         if (s)
5542         {
5543             uint32_t d = *dst;
5544
5545             __m128i ms = unpack_32_1x128 (s);
5546             __m128i alpha = expand_alpha_1x128 (ms);
5547             __m128i mask  = xmm_mask;
5548             __m128i dest  = unpack_32_1x128 (d);
5549
5550             *dst = pack_1x128_32 (
5551                 in_over_1x128 (&ms, &alpha, &mask, &dest));
5552         }
5553
5554         dst++;
5555         w--;
5556     }
5557
5558 }
5559
5560 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5561                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5562                               uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5563 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5564                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5565                               uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5566 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5567                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5568                               uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5569
5570 static void
5571 bilinear_interpolate_line_sse2 (uint32_t *       out,
5572                                 const uint32_t * top,
5573                                 const uint32_t * bottom,
5574                                 int              wt,
5575                                 int              wb,
5576                                 pixman_fixed_t   x,
5577                                 pixman_fixed_t   ux,
5578                                 int              width)
5579 {
5580     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
5581     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
5582     const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
5583     const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
5584     const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
5585     const __m128i xmm_zero = _mm_setzero_si128 ();
5586     __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
5587     uint32_t pix1, pix2, pix3, pix4;
5588
5589     #define INTERPOLATE_ONE_PIXEL(pix)                                          \
5590     do {                                                                        \
5591         __m128i xmm_wh, xmm_lo, xmm_hi, a;                                      \
5592         /* fetch 2x2 pixel block into sse2 register */                          \
5593         uint32_t tl = top [pixman_fixed_to_int (x)];                            \
5594         uint32_t tr = top [pixman_fixed_to_int (x) + 1];                        \
5595         uint32_t bl = bottom [pixman_fixed_to_int (x)];                         \
5596         uint32_t br = bottom [pixman_fixed_to_int (x) + 1];                     \
5597         a = _mm_set_epi32 (tr, tl, br, bl);                                     \
5598         x += ux;                                                                \
5599         /* vertical interpolation */                                            \
5600         a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),    \
5601                                             xmm_wt),                            \
5602                            _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),    \
5603                                             xmm_wb));                           \
5604         /* calculate horizontal weights */                                      \
5605         xmm_wh = _mm_add_epi16 (xmm_addc,                                       \
5606                                 _mm_xor_si128 (xmm_xorc,                        \
5607                                                _mm_srli_epi16 (xmm_x, 8)));     \
5608         xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                  \
5609         /* horizontal interpolation */                                          \
5610         xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                   \
5611         xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);                                   \
5612         a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),                 \
5613                            _mm_unpackhi_epi16 (xmm_lo, xmm_hi));                \
5614         /* shift and pack the result */                                         \
5615         a = _mm_srli_epi32 (a, 16);                                             \
5616         a = _mm_packs_epi32 (a, a);                                             \
5617         a = _mm_packus_epi16 (a, a);                                            \
5618         pix = _mm_cvtsi128_si32 (a);                                            \
5619     } while (0)
5620
5621     while ((width -= 4) >= 0)
5622     {
5623         INTERPOLATE_ONE_PIXEL (pix1);
5624         INTERPOLATE_ONE_PIXEL (pix2);
5625         INTERPOLATE_ONE_PIXEL (pix3);
5626         INTERPOLATE_ONE_PIXEL (pix4);
5627         *out++ = pix1;
5628         *out++ = pix2;
5629         *out++ = pix3;
5630         *out++ = pix4;
5631     }
5632     if (width & 2)
5633     {
5634         INTERPOLATE_ONE_PIXEL (pix1);
5635         INTERPOLATE_ONE_PIXEL (pix2);
5636         *out++ = pix1;
5637         *out++ = pix2;
5638     }
5639     if (width & 1)
5640     {
5641         INTERPOLATE_ONE_PIXEL (pix1);
5642         *out = pix1;
5643     }
5644
5645     #undef INTERPOLATE_ONE_PIXEL
5646 }
5647
5648 static force_inline void
5649 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
5650                                              const uint32_t * mask,
5651                                              const uint32_t * src_top,
5652                                              const uint32_t * src_bottom,
5653                                              int32_t          w,
5654                                              int              wt,
5655                                              int              wb,
5656                                              pixman_fixed_t   vx,
5657                                              pixman_fixed_t   unit_x,
5658                                              pixman_fixed_t   max_vx,
5659                                              pixman_bool_t    zero_src)
5660 {
5661     bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
5662                                     wt, wb, vx, unit_x, w);
5663 }
5664
5665 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5666                                scaled_bilinear_scanline_sse2_8888_8888_SRC,
5667                                uint32_t, uint32_t, uint32_t,
5668                                COVER, FALSE, FALSE)
5669 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5670                                scaled_bilinear_scanline_sse2_8888_8888_SRC,
5671                                uint32_t, uint32_t, uint32_t,
5672                                PAD, FALSE, FALSE)
5673 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5674                                scaled_bilinear_scanline_sse2_8888_8888_SRC,
5675                                uint32_t, uint32_t, uint32_t,
5676                                NONE, FALSE, FALSE)
5677
5678 static const pixman_fast_path_t sse2_fast_paths[] =
5679 {
5680     /* PIXMAN_OP_OVER */
5681     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5682     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5683     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5684     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5685     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5686     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5687     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5688     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5689     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5690     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5691     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5692     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5693     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5694     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5695     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5696     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5697     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5698     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5699     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5700     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5701     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5702     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5703     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5704     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5705     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5706     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5707     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5708     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5709     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5710     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5711     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5712     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5713     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5714     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5715     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5716     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5717     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5718     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5719     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5720     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5721     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5722     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5723     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5724     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5725     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5726     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5727     
5728     /* PIXMAN_OP_OVER_REVERSE */
5729     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5730     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5731
5732     /* PIXMAN_OP_ADD */
5733     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5734     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5735     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5736     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5737     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5738     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5739
5740     /* PIXMAN_OP_SRC */
5741     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5742     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5743     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5744     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5745     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5746     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5747     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5748     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5749     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5750     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5751     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5752     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5753     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5754     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5755
5756     /* PIXMAN_OP_IN */
5757     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5758     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5759     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5760
5761     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5762     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5763     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5764     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5765     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5766     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5767     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5768     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5769     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5770     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5771     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5772     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5773
5774     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
5775     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
5776     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
5777     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
5778
5779     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5780     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5781     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
5782
5783     { PIXMAN_OP_NONE },
5784 };
5785
5786 static pixman_bool_t
5787 sse2_blt (pixman_implementation_t *imp,
5788           uint32_t *               src_bits,
5789           uint32_t *               dst_bits,
5790           int                      src_stride,
5791           int                      dst_stride,
5792           int                      src_bpp,
5793           int                      dst_bpp,
5794           int                      src_x,
5795           int                      src_y,
5796           int                      dst_x,
5797           int                      dst_y,
5798           int                      width,
5799           int                      height)
5800 {
5801     if (!pixman_blt_sse2 (
5802             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5803             src_x, src_y, dst_x, dst_y, width, height))
5804
5805     {
5806         return _pixman_implementation_blt (
5807             imp->delegate,
5808             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5809             src_x, src_y, dst_x, dst_y, width, height);
5810     }
5811
5812     return TRUE;
5813 }
5814
5815 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5816 __attribute__((__force_align_arg_pointer__))
5817 #endif
5818 static pixman_bool_t
5819 sse2_fill (pixman_implementation_t *imp,
5820            uint32_t *               bits,
5821            int                      stride,
5822            int                      bpp,
5823            int                      x,
5824            int                      y,
5825            int                      width,
5826            int                      height,
5827            uint32_t xor)
5828 {
5829     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5830     {
5831         return _pixman_implementation_fill (
5832             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5833     }
5834
5835     return TRUE;
5836 }
5837
5838 static uint32_t *
5839 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
5840 {
5841     int w = iter->width;
5842     __m128i ff000000 = mask_ff000000;
5843     uint32_t *dst = iter->buffer;
5844     uint32_t *src = (uint32_t *)iter->bits;
5845
5846     iter->bits += iter->stride;
5847
5848     while (w && ((unsigned long)dst) & 0x0f)
5849     {
5850         *dst++ = (*src++) | 0xff000000;
5851         w--;
5852     }
5853
5854     while (w >= 4)
5855     {
5856         save_128_aligned (
5857             (__m128i *)dst, _mm_or_si128 (
5858                 load_128_unaligned ((__m128i *)src), ff000000));
5859
5860         dst += 4;
5861         src += 4;
5862         w -= 4;
5863     }
5864
5865     while (w)
5866     {
5867         *dst++ = (*src++) | 0xff000000;
5868         w--;
5869     }
5870
5871     return iter->buffer;
5872 }
5873
5874 static uint32_t *
5875 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
5876 {
5877     int w = iter->width;
5878     uint32_t *dst = iter->buffer;
5879     uint16_t *src = (uint16_t *)iter->bits;
5880     __m128i ff000000 = mask_ff000000;
5881
5882     iter->bits += iter->stride;
5883
5884     while (w && ((unsigned long)dst) & 0x0f)
5885     {
5886         uint16_t s = *src++;
5887
5888         *dst++ = CONVERT_0565_TO_8888 (s);
5889         w--;
5890     }
5891
5892     while (w >= 8)
5893     {
5894         __m128i lo, hi, s;
5895
5896         s = _mm_loadu_si128 ((__m128i *)src);
5897
5898         lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
5899         hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
5900
5901         save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
5902         save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
5903
5904         dst += 8;
5905         src += 8;
5906         w -= 8;
5907     }
5908
5909     while (w)
5910     {
5911         uint16_t s = *src++;
5912
5913         *dst++ = CONVERT_0565_TO_8888 (s);
5914         w--;
5915     }
5916
5917     return iter->buffer;
5918 }
5919
5920 static uint32_t *
5921 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
5922 {
5923     int w = iter->width;
5924     uint32_t *dst = iter->buffer;
5925     uint8_t *src = iter->bits;
5926     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5927
5928     iter->bits += iter->stride;
5929
5930     while (w && (((unsigned long)dst) & 15))
5931     {
5932         *dst++ = *(src++) << 24;
5933         w--;
5934     }
5935
5936     while (w >= 16)
5937     {
5938         xmm0 = _mm_loadu_si128((__m128i *)src);
5939
5940         xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
5941         xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
5942         xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
5943         xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
5944         xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
5945         xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
5946
5947         _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
5948         _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
5949         _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
5950         _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
5951
5952         dst += 16;
5953         src += 16;
5954         w -= 16;
5955     }
5956
5957     while (w)
5958     {
5959         *dst++ = *(src++) << 24;
5960         w--;
5961     }
5962
5963     return iter->buffer;
5964 }
5965
5966 typedef struct
5967 {
5968     pixman_format_code_t        format;
5969     pixman_iter_get_scanline_t  get_scanline;
5970 } fetcher_info_t;
5971
5972 static const fetcher_info_t fetchers[] =
5973 {
5974     { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
5975     { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
5976     { PIXMAN_a8,                sse2_fetch_a8 },
5977     { PIXMAN_null }
5978 };
5979
5980 static void
5981 sse2_src_iter_init (pixman_implementation_t *imp,
5982                     pixman_iter_t *iter,
5983                     pixman_image_t *image,
5984                     int x, int y, int width, int height,
5985                     uint8_t *buffer, iter_flags_t flags)
5986 {
5987 #define FLAGS                                                           \
5988     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
5989
5990     if ((flags & ITER_NARROW)                           &&
5991         (image->common.flags & FLAGS) == FLAGS          &&
5992         x >= 0 && y >= 0                                &&
5993         x + width <= image->bits.width                  &&
5994         y + height <= image->bits.height)
5995     {
5996         const fetcher_info_t *f;
5997
5998         for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
5999         {
6000             if (image->common.extended_format_code == f->format)
6001             {
6002                 uint8_t *b = (uint8_t *)image->bits.bits;
6003                 int s = image->bits.rowstride * 4;
6004
6005                 iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
6006                 iter->stride = s;
6007
6008                 iter->get_scanline = f->get_scanline;
6009                 return;
6010             }
6011         }
6012     }
6013
6014     imp->delegate->src_iter_init (
6015         imp->delegate, iter, image, x, y, width, height, buffer, flags);
6016 }
6017
6018 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6019 __attribute__((__force_align_arg_pointer__))
6020 #endif
6021 pixman_implementation_t *
6022 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6023 {
6024     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6025
6026     /* SSE2 constants */
6027     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6028     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6029     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6030     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6031     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6032     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6033     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6034     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6035     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
6036     mask_0080 = create_mask_16_128 (0x0080);
6037     mask_00ff = create_mask_16_128 (0x00ff);
6038     mask_0101 = create_mask_16_128 (0x0101);
6039     mask_ffff = create_mask_16_128 (0xffff);
6040     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6041     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6042
6043     /* Set up function pointers */
6044     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6045     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6046     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6047     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6048     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6049     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6050     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6051     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6052     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6053     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6054
6055     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6056
6057     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6058     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6059     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6060     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6061     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6062     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6063     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6064     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6065     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6066     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6067     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6068
6069     imp->blt = sse2_blt;
6070     imp->fill = sse2_fill;
6071
6072     imp->src_iter_init = sse2_src_iter_init;
6073
6074     return imp;
6075 }