Add packaging files for Tizen
[profile/ivi/pixman.git] / pixman / pixman-sse2.c
1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
34 #include <emmintrin.h> /* for SSE2 intrinsics */
35 #include "pixman-private.h"
36 #include "pixman-combine32.h"
37 #include "pixman-inlines.h"
38
39 static __m128i mask_0080;
40 static __m128i mask_00ff;
41 static __m128i mask_0101;
42 static __m128i mask_ffff;
43 static __m128i mask_ff000000;
44 static __m128i mask_alpha;
45
46 static __m128i mask_565_r;
47 static __m128i mask_565_g1, mask_565_g2;
48 static __m128i mask_565_b;
49 static __m128i mask_red;
50 static __m128i mask_green;
51 static __m128i mask_blue;
52
53 static __m128i mask_565_fix_rb;
54 static __m128i mask_565_fix_g;
55
56 static force_inline __m128i
57 unpack_32_1x128 (uint32_t data)
58 {
59     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
60 }
61
62 static force_inline void
63 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
64 {
65     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
66     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
67 }
68
69 static force_inline __m128i
70 unpack_565_to_8888 (__m128i lo)
71 {
72     __m128i r, g, b, rb, t;
73
74     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
75     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
76     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
77
78     rb = _mm_or_si128 (r, b);
79     t  = _mm_and_si128 (rb, mask_565_fix_rb);
80     t  = _mm_srli_epi32 (t, 5);
81     rb = _mm_or_si128 (rb, t);
82
83     t  = _mm_and_si128 (g, mask_565_fix_g);
84     t  = _mm_srli_epi32 (t, 6);
85     g  = _mm_or_si128 (g, t);
86
87     return _mm_or_si128 (rb, g);
88 }
89
90 static force_inline void
91 unpack_565_128_4x128 (__m128i  data,
92                       __m128i* data0,
93                       __m128i* data1,
94                       __m128i* data2,
95                       __m128i* data3)
96 {
97     __m128i lo, hi;
98
99     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
100     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
101
102     lo = unpack_565_to_8888 (lo);
103     hi = unpack_565_to_8888 (hi);
104
105     unpack_128_2x128 (lo, data0, data1);
106     unpack_128_2x128 (hi, data2, data3);
107 }
108
109 static force_inline uint16_t
110 pack_565_32_16 (uint32_t pixel)
111 {
112     return (uint16_t) (((pixel >> 8) & 0xf800) |
113                        ((pixel >> 5) & 0x07e0) |
114                        ((pixel >> 3) & 0x001f));
115 }
116
117 static force_inline __m128i
118 pack_2x128_128 (__m128i lo, __m128i hi)
119 {
120     return _mm_packus_epi16 (lo, hi);
121 }
122
123 static force_inline __m128i
124 pack_565_2x128_128 (__m128i lo, __m128i hi)
125 {
126     __m128i data;
127     __m128i r, g1, g2, b;
128
129     data = pack_2x128_128 (lo, hi);
130
131     r  = _mm_and_si128 (data, mask_565_r);
132     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
133     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
134     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
135
136     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
137 }
138
139 static force_inline __m128i
140 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
141 {
142     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
143                              pack_565_2x128_128 (*xmm2, *xmm3));
144 }
145
146 static force_inline int
147 is_opaque (__m128i x)
148 {
149     __m128i ffs = _mm_cmpeq_epi8 (x, x);
150
151     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
152 }
153
154 static force_inline int
155 is_zero (__m128i x)
156 {
157     return _mm_movemask_epi8 (
158         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
159 }
160
161 static force_inline int
162 is_transparent (__m128i x)
163 {
164     return (_mm_movemask_epi8 (
165                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
166 }
167
168 static force_inline __m128i
169 expand_pixel_32_1x128 (uint32_t data)
170 {
171     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
172 }
173
174 static force_inline __m128i
175 expand_alpha_1x128 (__m128i data)
176 {
177     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
178                                                      _MM_SHUFFLE (3, 3, 3, 3)),
179                                 _MM_SHUFFLE (3, 3, 3, 3));
180 }
181
182 static force_inline void
183 expand_alpha_2x128 (__m128i  data_lo,
184                     __m128i  data_hi,
185                     __m128i* alpha_lo,
186                     __m128i* alpha_hi)
187 {
188     __m128i lo, hi;
189
190     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
191     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
192
193     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
194     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
195 }
196
197 static force_inline void
198 expand_alpha_rev_2x128 (__m128i  data_lo,
199                         __m128i  data_hi,
200                         __m128i* alpha_lo,
201                         __m128i* alpha_hi)
202 {
203     __m128i lo, hi;
204
205     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
206     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
207     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
208     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
209 }
210
211 static force_inline void
212 pix_multiply_2x128 (__m128i* data_lo,
213                     __m128i* data_hi,
214                     __m128i* alpha_lo,
215                     __m128i* alpha_hi,
216                     __m128i* ret_lo,
217                     __m128i* ret_hi)
218 {
219     __m128i lo, hi;
220
221     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
222     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
223     lo = _mm_adds_epu16 (lo, mask_0080);
224     hi = _mm_adds_epu16 (hi, mask_0080);
225     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
226     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
227 }
228
229 static force_inline void
230 pix_add_multiply_2x128 (__m128i* src_lo,
231                         __m128i* src_hi,
232                         __m128i* alpha_dst_lo,
233                         __m128i* alpha_dst_hi,
234                         __m128i* dst_lo,
235                         __m128i* dst_hi,
236                         __m128i* alpha_src_lo,
237                         __m128i* alpha_src_hi,
238                         __m128i* ret_lo,
239                         __m128i* ret_hi)
240 {
241     __m128i t1_lo, t1_hi;
242     __m128i t2_lo, t2_hi;
243
244     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
245     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
246
247     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
248     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
249 }
250
251 static force_inline void
252 negate_2x128 (__m128i  data_lo,
253               __m128i  data_hi,
254               __m128i* neg_lo,
255               __m128i* neg_hi)
256 {
257     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
258     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
259 }
260
261 static force_inline void
262 invert_colors_2x128 (__m128i  data_lo,
263                      __m128i  data_hi,
264                      __m128i* inv_lo,
265                      __m128i* inv_hi)
266 {
267     __m128i lo, hi;
268
269     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
270     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
271     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
272     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
273 }
274
275 static force_inline void
276 over_2x128 (__m128i* src_lo,
277             __m128i* src_hi,
278             __m128i* alpha_lo,
279             __m128i* alpha_hi,
280             __m128i* dst_lo,
281             __m128i* dst_hi)
282 {
283     __m128i t1, t2;
284
285     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
286
287     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
288
289     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
290     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
291 }
292
293 static force_inline void
294 over_rev_non_pre_2x128 (__m128i  src_lo,
295                         __m128i  src_hi,
296                         __m128i* dst_lo,
297                         __m128i* dst_hi)
298 {
299     __m128i lo, hi;
300     __m128i alpha_lo, alpha_hi;
301
302     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
303
304     lo = _mm_or_si128 (alpha_lo, mask_alpha);
305     hi = _mm_or_si128 (alpha_hi, mask_alpha);
306
307     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
308
309     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
310
311     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
312 }
313
314 static force_inline void
315 in_over_2x128 (__m128i* src_lo,
316                __m128i* src_hi,
317                __m128i* alpha_lo,
318                __m128i* alpha_hi,
319                __m128i* mask_lo,
320                __m128i* mask_hi,
321                __m128i* dst_lo,
322                __m128i* dst_hi)
323 {
324     __m128i s_lo, s_hi;
325     __m128i a_lo, a_hi;
326
327     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
328     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
329
330     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
331 }
332
333 /* load 4 pixels from a 16-byte boundary aligned address */
334 static force_inline __m128i
335 load_128_aligned (__m128i* src)
336 {
337     return _mm_load_si128 (src);
338 }
339
340 /* load 4 pixels from a unaligned address */
341 static force_inline __m128i
342 load_128_unaligned (const __m128i* src)
343 {
344     return _mm_loadu_si128 (src);
345 }
346
347 /* save 4 pixels using Write Combining memory on a 16-byte
348  * boundary aligned address
349  */
350 static force_inline void
351 save_128_write_combining (__m128i* dst,
352                           __m128i  data)
353 {
354     _mm_stream_si128 (dst, data);
355 }
356
357 /* save 4 pixels on a 16-byte boundary aligned address */
358 static force_inline void
359 save_128_aligned (__m128i* dst,
360                   __m128i  data)
361 {
362     _mm_store_si128 (dst, data);
363 }
364
365 /* save 4 pixels on a unaligned address */
366 static force_inline void
367 save_128_unaligned (__m128i* dst,
368                     __m128i  data)
369 {
370     _mm_storeu_si128 (dst, data);
371 }
372
373 static force_inline __m128i
374 load_32_1x128 (uint32_t data)
375 {
376     return _mm_cvtsi32_si128 (data);
377 }
378
379 static force_inline __m128i
380 expand_alpha_rev_1x128 (__m128i data)
381 {
382     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
383 }
384
385 static force_inline __m128i
386 expand_pixel_8_1x128 (uint8_t data)
387 {
388     return _mm_shufflelo_epi16 (
389         unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
390 }
391
392 static force_inline __m128i
393 pix_multiply_1x128 (__m128i data,
394                     __m128i alpha)
395 {
396     return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
397                                             mask_0080),
398                             mask_0101);
399 }
400
401 static force_inline __m128i
402 pix_add_multiply_1x128 (__m128i* src,
403                         __m128i* alpha_dst,
404                         __m128i* dst,
405                         __m128i* alpha_src)
406 {
407     __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
408     __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
409
410     return _mm_adds_epu8 (t1, t2);
411 }
412
413 static force_inline __m128i
414 negate_1x128 (__m128i data)
415 {
416     return _mm_xor_si128 (data, mask_00ff);
417 }
418
419 static force_inline __m128i
420 invert_colors_1x128 (__m128i data)
421 {
422     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
423 }
424
425 static force_inline __m128i
426 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
427 {
428     return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
429 }
430
431 static force_inline __m128i
432 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
433 {
434     return over_1x128 (pix_multiply_1x128 (*src, *mask),
435                        pix_multiply_1x128 (*alpha, *mask),
436                        *dst);
437 }
438
439 static force_inline __m128i
440 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
441 {
442     __m128i alpha = expand_alpha_1x128 (src);
443
444     return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
445                                            _mm_or_si128 (alpha, mask_alpha)),
446                        alpha,
447                        dst);
448 }
449
450 static force_inline uint32_t
451 pack_1x128_32 (__m128i data)
452 {
453     return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
454 }
455
456 static force_inline __m128i
457 expand565_16_1x128 (uint16_t pixel)
458 {
459     __m128i m = _mm_cvtsi32_si128 (pixel);
460
461     m = unpack_565_to_8888 (m);
462
463     return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
464 }
465
466 static force_inline uint32_t
467 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
468 {
469     uint8_t a;
470     __m128i xmms;
471
472     a = src >> 24;
473
474     if (a == 0xff)
475     {
476         return src;
477     }
478     else if (src)
479     {
480         xmms = unpack_32_1x128 (src);
481         return pack_1x128_32 (
482             over_1x128 (xmms, expand_alpha_1x128 (xmms),
483                         unpack_32_1x128 (dst)));
484     }
485
486     return dst;
487 }
488
489 static force_inline uint32_t
490 combine1 (const uint32_t *ps, const uint32_t *pm)
491 {
492     uint32_t s = *ps;
493
494     if (pm)
495     {
496         __m128i ms, mm;
497
498         mm = unpack_32_1x128 (*pm);
499         mm = expand_alpha_1x128 (mm);
500
501         ms = unpack_32_1x128 (s);
502         ms = pix_multiply_1x128 (ms, mm);
503
504         s = pack_1x128_32 (ms);
505     }
506
507     return s;
508 }
509
510 static force_inline __m128i
511 combine4 (const __m128i *ps, const __m128i *pm)
512 {
513     __m128i xmm_src_lo, xmm_src_hi;
514     __m128i xmm_msk_lo, xmm_msk_hi;
515     __m128i s;
516
517     if (pm)
518     {
519         xmm_msk_lo = load_128_unaligned (pm);
520
521         if (is_transparent (xmm_msk_lo))
522             return _mm_setzero_si128 ();
523     }
524
525     s = load_128_unaligned (ps);
526
527     if (pm)
528     {
529         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
530         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
531
532         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
533
534         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
535                             &xmm_msk_lo, &xmm_msk_hi,
536                             &xmm_src_lo, &xmm_src_hi);
537
538         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
539     }
540
541     return s;
542 }
543
544 static force_inline void
545 core_combine_over_u_sse2_mask (uint32_t *         pd,
546                                const uint32_t*    ps,
547                                const uint32_t*    pm,
548                                int                w)
549 {
550     uint32_t s, d;
551
552     /* Align dst on a 16-byte boundary */
553     while (w && ((unsigned long)pd & 15))
554     {
555         d = *pd;
556         s = combine1 (ps, pm);
557
558         if (s)
559             *pd = core_combine_over_u_pixel_sse2 (s, d);
560         pd++;
561         ps++;
562         pm++;
563         w--;
564     }
565
566     while (w >= 4)
567     {
568         __m128i mask = load_128_unaligned ((__m128i *)pm);
569
570         if (!is_zero (mask))
571         {
572             __m128i src;
573             __m128i src_hi, src_lo;
574             __m128i mask_hi, mask_lo;
575             __m128i alpha_hi, alpha_lo;
576
577             src = load_128_unaligned ((__m128i *)ps);
578
579             if (is_opaque (_mm_and_si128 (src, mask)))
580             {
581                 save_128_aligned ((__m128i *)pd, src);
582             }
583             else
584             {
585                 __m128i dst = load_128_aligned ((__m128i *)pd);
586                 __m128i dst_hi, dst_lo;
587
588                 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
589                 unpack_128_2x128 (src, &src_lo, &src_hi);
590
591                 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
592                 pix_multiply_2x128 (&src_lo, &src_hi,
593                                     &mask_lo, &mask_hi,
594                                     &src_lo, &src_hi);
595
596                 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
597
598                 expand_alpha_2x128 (src_lo, src_hi,
599                                     &alpha_lo, &alpha_hi);
600
601                 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
602                             &dst_lo, &dst_hi);
603
604                 save_128_aligned (
605                     (__m128i *)pd,
606                     pack_2x128_128 (dst_lo, dst_hi));
607             }
608         }
609
610         pm += 4;
611         ps += 4;
612         pd += 4;
613         w -= 4;
614     }
615     while (w)
616     {
617         d = *pd;
618         s = combine1 (ps, pm);
619
620         if (s)
621             *pd = core_combine_over_u_pixel_sse2 (s, d);
622         pd++;
623         ps++;
624         pm++;
625
626         w--;
627     }
628 }
629
630 static force_inline void
631 core_combine_over_u_sse2_no_mask (uint32_t *      pd,
632                                   const uint32_t*    ps,
633                                   int                w)
634 {
635     uint32_t s, d;
636
637     /* Align dst on a 16-byte boundary */
638     while (w && ((unsigned long)pd & 15))
639     {
640         d = *pd;
641         s = *ps;
642
643         if (s)
644             *pd = core_combine_over_u_pixel_sse2 (s, d);
645         pd++;
646         ps++;
647         w--;
648     }
649
650     while (w >= 4)
651     {
652         __m128i src;
653         __m128i src_hi, src_lo, dst_hi, dst_lo;
654         __m128i alpha_hi, alpha_lo;
655
656         src = load_128_unaligned ((__m128i *)ps);
657
658         if (!is_zero (src))
659         {
660             if (is_opaque (src))
661             {
662                 save_128_aligned ((__m128i *)pd, src);
663             }
664             else
665             {
666                 __m128i dst = load_128_aligned ((__m128i *)pd);
667
668                 unpack_128_2x128 (src, &src_lo, &src_hi);
669                 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
670
671                 expand_alpha_2x128 (src_lo, src_hi,
672                                     &alpha_lo, &alpha_hi);
673                 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
674                             &dst_lo, &dst_hi);
675
676                 save_128_aligned (
677                     (__m128i *)pd,
678                     pack_2x128_128 (dst_lo, dst_hi));
679             }
680         }
681
682         ps += 4;
683         pd += 4;
684         w -= 4;
685     }
686     while (w)
687     {
688         d = *pd;
689         s = *ps;
690
691         if (s)
692             *pd = core_combine_over_u_pixel_sse2 (s, d);
693         pd++;
694         ps++;
695
696         w--;
697     }
698 }
699
700 static force_inline void
701 sse2_combine_over_u (pixman_implementation_t *imp,
702                      pixman_op_t              op,
703                      uint32_t *               pd,
704                      const uint32_t *         ps,
705                      const uint32_t *         pm,
706                      int                      w)
707 {
708     if (pm)
709         core_combine_over_u_sse2_mask (pd, ps, pm, w);
710     else
711         core_combine_over_u_sse2_no_mask (pd, ps, w);
712 }
713
714 static void
715 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
716                              pixman_op_t              op,
717                              uint32_t *               pd,
718                              const uint32_t *         ps,
719                              const uint32_t *         pm,
720                              int                      w)
721 {
722     uint32_t s, d;
723
724     __m128i xmm_dst_lo, xmm_dst_hi;
725     __m128i xmm_src_lo, xmm_src_hi;
726     __m128i xmm_alpha_lo, xmm_alpha_hi;
727
728     /* Align dst on a 16-byte boundary */
729     while (w &&
730            ((unsigned long)pd & 15))
731     {
732         d = *pd;
733         s = combine1 (ps, pm);
734
735         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
736         w--;
737         ps++;
738         if (pm)
739             pm++;
740     }
741
742     while (w >= 4)
743     {
744         /* I'm loading unaligned because I'm not sure
745          * about the address alignment.
746          */
747         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
748         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
749
750         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
751         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
752
753         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
754                             &xmm_alpha_lo, &xmm_alpha_hi);
755
756         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
757                     &xmm_alpha_lo, &xmm_alpha_hi,
758                     &xmm_src_lo, &xmm_src_hi);
759
760         /* rebuid the 4 pixel data and save*/
761         save_128_aligned ((__m128i*)pd,
762                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
763
764         w -= 4;
765         ps += 4;
766         pd += 4;
767
768         if (pm)
769             pm += 4;
770     }
771
772     while (w)
773     {
774         d = *pd;
775         s = combine1 (ps, pm);
776
777         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
778         ps++;
779         w--;
780         if (pm)
781             pm++;
782     }
783 }
784
785 static force_inline uint32_t
786 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
787 {
788     uint32_t maska = src >> 24;
789
790     if (maska == 0)
791     {
792         return 0;
793     }
794     else if (maska != 0xff)
795     {
796         return pack_1x128_32 (
797             pix_multiply_1x128 (unpack_32_1x128 (dst),
798                                 expand_alpha_1x128 (unpack_32_1x128 (src))));
799     }
800
801     return dst;
802 }
803
804 static void
805 sse2_combine_in_u (pixman_implementation_t *imp,
806                    pixman_op_t              op,
807                    uint32_t *               pd,
808                    const uint32_t *         ps,
809                    const uint32_t *         pm,
810                    int                      w)
811 {
812     uint32_t s, d;
813
814     __m128i xmm_src_lo, xmm_src_hi;
815     __m128i xmm_dst_lo, xmm_dst_hi;
816
817     while (w && ((unsigned long) pd & 15))
818     {
819         s = combine1 (ps, pm);
820         d = *pd;
821
822         *pd++ = core_combine_in_u_pixel_sse2 (d, s);
823         w--;
824         ps++;
825         if (pm)
826             pm++;
827     }
828
829     while (w >= 4)
830     {
831         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
832         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
833
834         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
835         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
836
837         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
838         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
839                             &xmm_dst_lo, &xmm_dst_hi,
840                             &xmm_dst_lo, &xmm_dst_hi);
841
842         save_128_aligned ((__m128i*)pd,
843                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
844
845         ps += 4;
846         pd += 4;
847         w -= 4;
848         if (pm)
849             pm += 4;
850     }
851
852     while (w)
853     {
854         s = combine1 (ps, pm);
855         d = *pd;
856
857         *pd++ = core_combine_in_u_pixel_sse2 (d, s);
858         w--;
859         ps++;
860         if (pm)
861             pm++;
862     }
863 }
864
865 static void
866 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
867                            pixman_op_t              op,
868                            uint32_t *               pd,
869                            const uint32_t *         ps,
870                            const uint32_t *         pm,
871                            int                      w)
872 {
873     uint32_t s, d;
874
875     __m128i xmm_src_lo, xmm_src_hi;
876     __m128i xmm_dst_lo, xmm_dst_hi;
877
878     while (w && ((unsigned long) pd & 15))
879     {
880         s = combine1 (ps, pm);
881         d = *pd;
882
883         *pd++ = core_combine_in_u_pixel_sse2 (s, d);
884         ps++;
885         w--;
886         if (pm)
887             pm++;
888     }
889
890     while (w >= 4)
891     {
892         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
893         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
894
895         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
896         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
897
898         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
899         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
900                             &xmm_src_lo, &xmm_src_hi,
901                             &xmm_dst_lo, &xmm_dst_hi);
902
903         save_128_aligned (
904             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
905
906         ps += 4;
907         pd += 4;
908         w -= 4;
909         if (pm)
910             pm += 4;
911     }
912
913     while (w)
914     {
915         s = combine1 (ps, pm);
916         d = *pd;
917
918         *pd++ = core_combine_in_u_pixel_sse2 (s, d);
919         w--;
920         ps++;
921         if (pm)
922             pm++;
923     }
924 }
925
926 static void
927 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
928                             pixman_op_t              op,
929                             uint32_t *               pd,
930                             const uint32_t *         ps,
931                             const uint32_t *         pm,
932                             int                      w)
933 {
934     while (w && ((unsigned long) pd & 15))
935     {
936         uint32_t s = combine1 (ps, pm);
937         uint32_t d = *pd;
938
939         *pd++ = pack_1x128_32 (
940             pix_multiply_1x128 (
941                 unpack_32_1x128 (d), negate_1x128 (
942                     expand_alpha_1x128 (unpack_32_1x128 (s)))));
943
944         if (pm)
945             pm++;
946         ps++;
947         w--;
948     }
949
950     while (w >= 4)
951     {
952         __m128i xmm_src_lo, xmm_src_hi;
953         __m128i xmm_dst_lo, xmm_dst_hi;
954
955         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
956         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
957
958         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
959         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
960
961         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
962         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
963
964         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
965                             &xmm_src_lo, &xmm_src_hi,
966                             &xmm_dst_lo, &xmm_dst_hi);
967
968         save_128_aligned (
969             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
970
971         ps += 4;
972         pd += 4;
973         if (pm)
974             pm += 4;
975
976         w -= 4;
977     }
978
979     while (w)
980     {
981         uint32_t s = combine1 (ps, pm);
982         uint32_t d = *pd;
983
984         *pd++ = pack_1x128_32 (
985             pix_multiply_1x128 (
986                 unpack_32_1x128 (d), negate_1x128 (
987                     expand_alpha_1x128 (unpack_32_1x128 (s)))));
988         ps++;
989         if (pm)
990             pm++;
991         w--;
992     }
993 }
994
995 static void
996 sse2_combine_out_u (pixman_implementation_t *imp,
997                     pixman_op_t              op,
998                     uint32_t *               pd,
999                     const uint32_t *         ps,
1000                     const uint32_t *         pm,
1001                     int                      w)
1002 {
1003     while (w && ((unsigned long) pd & 15))
1004     {
1005         uint32_t s = combine1 (ps, pm);
1006         uint32_t d = *pd;
1007
1008         *pd++ = pack_1x128_32 (
1009             pix_multiply_1x128 (
1010                 unpack_32_1x128 (s), negate_1x128 (
1011                     expand_alpha_1x128 (unpack_32_1x128 (d)))));
1012         w--;
1013         ps++;
1014         if (pm)
1015             pm++;
1016     }
1017
1018     while (w >= 4)
1019     {
1020         __m128i xmm_src_lo, xmm_src_hi;
1021         __m128i xmm_dst_lo, xmm_dst_hi;
1022
1023         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1024         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1025
1026         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1027         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1028
1029         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1030         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1031
1032         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1033                             &xmm_dst_lo, &xmm_dst_hi,
1034                             &xmm_dst_lo, &xmm_dst_hi);
1035
1036         save_128_aligned (
1037             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1038
1039         ps += 4;
1040         pd += 4;
1041         w -= 4;
1042         if (pm)
1043             pm += 4;
1044     }
1045
1046     while (w)
1047     {
1048         uint32_t s = combine1 (ps, pm);
1049         uint32_t d = *pd;
1050
1051         *pd++ = pack_1x128_32 (
1052             pix_multiply_1x128 (
1053                 unpack_32_1x128 (s), negate_1x128 (
1054                     expand_alpha_1x128 (unpack_32_1x128 (d)))));
1055         w--;
1056         ps++;
1057         if (pm)
1058             pm++;
1059     }
1060 }
1061
1062 static force_inline uint32_t
1063 core_combine_atop_u_pixel_sse2 (uint32_t src,
1064                                 uint32_t dst)
1065 {
1066     __m128i s = unpack_32_1x128 (src);
1067     __m128i d = unpack_32_1x128 (dst);
1068
1069     __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1070     __m128i da = expand_alpha_1x128 (d);
1071
1072     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1073 }
1074
1075 static void
1076 sse2_combine_atop_u (pixman_implementation_t *imp,
1077                      pixman_op_t              op,
1078                      uint32_t *               pd,
1079                      const uint32_t *         ps,
1080                      const uint32_t *         pm,
1081                      int                      w)
1082 {
1083     uint32_t s, d;
1084
1085     __m128i xmm_src_lo, xmm_src_hi;
1086     __m128i xmm_dst_lo, xmm_dst_hi;
1087     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1088     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1089
1090     while (w && ((unsigned long) pd & 15))
1091     {
1092         s = combine1 (ps, pm);
1093         d = *pd;
1094
1095         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1096         w--;
1097         ps++;
1098         if (pm)
1099             pm++;
1100     }
1101
1102     while (w >= 4)
1103     {
1104         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1105         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1106
1107         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1108         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1109
1110         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1111                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1112         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1113                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1114
1115         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1116                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1117
1118         pix_add_multiply_2x128 (
1119             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1120             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1121             &xmm_dst_lo, &xmm_dst_hi);
1122
1123         save_128_aligned (
1124             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1125
1126         ps += 4;
1127         pd += 4;
1128         w -= 4;
1129         if (pm)
1130             pm += 4;
1131     }
1132
1133     while (w)
1134     {
1135         s = combine1 (ps, pm);
1136         d = *pd;
1137
1138         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1139         w--;
1140         ps++;
1141         if (pm)
1142             pm++;
1143     }
1144 }
1145
1146 static force_inline uint32_t
1147 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1148                                         uint32_t dst)
1149 {
1150     __m128i s = unpack_32_1x128 (src);
1151     __m128i d = unpack_32_1x128 (dst);
1152
1153     __m128i sa = expand_alpha_1x128 (s);
1154     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1155
1156     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1157 }
1158
1159 static void
1160 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1161                              pixman_op_t              op,
1162                              uint32_t *               pd,
1163                              const uint32_t *         ps,
1164                              const uint32_t *         pm,
1165                              int                      w)
1166 {
1167     uint32_t s, d;
1168
1169     __m128i xmm_src_lo, xmm_src_hi;
1170     __m128i xmm_dst_lo, xmm_dst_hi;
1171     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1172     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1173
1174     while (w && ((unsigned long) pd & 15))
1175     {
1176         s = combine1 (ps, pm);
1177         d = *pd;
1178
1179         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1180         ps++;
1181         w--;
1182         if (pm)
1183             pm++;
1184     }
1185
1186     while (w >= 4)
1187     {
1188         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1189         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1190
1191         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1192         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1193
1194         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1195                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1196         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1197                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1198
1199         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1200                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1201
1202         pix_add_multiply_2x128 (
1203             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1204             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1205             &xmm_dst_lo, &xmm_dst_hi);
1206
1207         save_128_aligned (
1208             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1209
1210         ps += 4;
1211         pd += 4;
1212         w -= 4;
1213         if (pm)
1214             pm += 4;
1215     }
1216
1217     while (w)
1218     {
1219         s = combine1 (ps, pm);
1220         d = *pd;
1221
1222         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1223         ps++;
1224         w--;
1225         if (pm)
1226             pm++;
1227     }
1228 }
1229
1230 static force_inline uint32_t
1231 core_combine_xor_u_pixel_sse2 (uint32_t src,
1232                                uint32_t dst)
1233 {
1234     __m128i s = unpack_32_1x128 (src);
1235     __m128i d = unpack_32_1x128 (dst);
1236
1237     __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1238     __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1239
1240     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1241 }
1242
1243 static void
1244 sse2_combine_xor_u (pixman_implementation_t *imp,
1245                     pixman_op_t              op,
1246                     uint32_t *               dst,
1247                     const uint32_t *         src,
1248                     const uint32_t *         mask,
1249                     int                      width)
1250 {
1251     int w = width;
1252     uint32_t s, d;
1253     uint32_t* pd = dst;
1254     const uint32_t* ps = src;
1255     const uint32_t* pm = mask;
1256
1257     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1258     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1259     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1260     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1261
1262     while (w && ((unsigned long) pd & 15))
1263     {
1264         s = combine1 (ps, pm);
1265         d = *pd;
1266
1267         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1268         w--;
1269         ps++;
1270         if (pm)
1271             pm++;
1272     }
1273
1274     while (w >= 4)
1275     {
1276         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1277         xmm_dst = load_128_aligned ((__m128i*) pd);
1278
1279         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1280         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1281
1282         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1283                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1284         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1285                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1286
1287         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1288                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1289         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1290                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1291
1292         pix_add_multiply_2x128 (
1293             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1294             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1295             &xmm_dst_lo, &xmm_dst_hi);
1296
1297         save_128_aligned (
1298             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1299
1300         ps += 4;
1301         pd += 4;
1302         w -= 4;
1303         if (pm)
1304             pm += 4;
1305     }
1306
1307     while (w)
1308     {
1309         s = combine1 (ps, pm);
1310         d = *pd;
1311
1312         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1313         w--;
1314         ps++;
1315         if (pm)
1316             pm++;
1317     }
1318 }
1319
1320 static force_inline void
1321 sse2_combine_add_u (pixman_implementation_t *imp,
1322                     pixman_op_t              op,
1323                     uint32_t *               dst,
1324                     const uint32_t *         src,
1325                     const uint32_t *         mask,
1326                     int                      width)
1327 {
1328     int w = width;
1329     uint32_t s, d;
1330     uint32_t* pd = dst;
1331     const uint32_t* ps = src;
1332     const uint32_t* pm = mask;
1333
1334     while (w && (unsigned long)pd & 15)
1335     {
1336         s = combine1 (ps, pm);
1337         d = *pd;
1338
1339         ps++;
1340         if (pm)
1341             pm++;
1342         *pd++ = _mm_cvtsi128_si32 (
1343             _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1344         w--;
1345     }
1346
1347     while (w >= 4)
1348     {
1349         __m128i s;
1350
1351         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1352
1353         save_128_aligned (
1354             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1355
1356         pd += 4;
1357         ps += 4;
1358         if (pm)
1359             pm += 4;
1360         w -= 4;
1361     }
1362
1363     while (w--)
1364     {
1365         s = combine1 (ps, pm);
1366         d = *pd;
1367
1368         ps++;
1369         *pd++ = _mm_cvtsi128_si32 (
1370             _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1371         if (pm)
1372             pm++;
1373     }
1374 }
1375
1376 static force_inline uint32_t
1377 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1378                                     uint32_t dst)
1379 {
1380     __m128i ms = unpack_32_1x128 (src);
1381     __m128i md = unpack_32_1x128 (dst);
1382     uint32_t sa = src >> 24;
1383     uint32_t da = ~dst >> 24;
1384
1385     if (sa > da)
1386     {
1387         ms = pix_multiply_1x128 (
1388             ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1389     }
1390
1391     return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1392 }
1393
1394 static void
1395 sse2_combine_saturate_u (pixman_implementation_t *imp,
1396                          pixman_op_t              op,
1397                          uint32_t *               pd,
1398                          const uint32_t *         ps,
1399                          const uint32_t *         pm,
1400                          int                      w)
1401 {
1402     uint32_t s, d;
1403
1404     uint32_t pack_cmp;
1405     __m128i xmm_src, xmm_dst;
1406
1407     while (w && (unsigned long)pd & 15)
1408     {
1409         s = combine1 (ps, pm);
1410         d = *pd;
1411
1412         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1413         w--;
1414         ps++;
1415         if (pm)
1416             pm++;
1417     }
1418
1419     while (w >= 4)
1420     {
1421         xmm_dst = load_128_aligned  ((__m128i*)pd);
1422         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1423
1424         pack_cmp = _mm_movemask_epi8 (
1425             _mm_cmpgt_epi32 (
1426                 _mm_srli_epi32 (xmm_src, 24),
1427                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1428
1429         /* if some alpha src is grater than respective ~alpha dst */
1430         if (pack_cmp)
1431         {
1432             s = combine1 (ps++, pm);
1433             d = *pd;
1434             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1435             if (pm)
1436                 pm++;
1437
1438             s = combine1 (ps++, pm);
1439             d = *pd;
1440             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1441             if (pm)
1442                 pm++;
1443
1444             s = combine1 (ps++, pm);
1445             d = *pd;
1446             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1447             if (pm)
1448                 pm++;
1449
1450             s = combine1 (ps++, pm);
1451             d = *pd;
1452             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1453             if (pm)
1454                 pm++;
1455         }
1456         else
1457         {
1458             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1459
1460             pd += 4;
1461             ps += 4;
1462             if (pm)
1463                 pm += 4;
1464         }
1465
1466         w -= 4;
1467     }
1468
1469     while (w--)
1470     {
1471         s = combine1 (ps, pm);
1472         d = *pd;
1473
1474         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1475         ps++;
1476         if (pm)
1477             pm++;
1478     }
1479 }
1480
1481 static void
1482 sse2_combine_src_ca (pixman_implementation_t *imp,
1483                      pixman_op_t              op,
1484                      uint32_t *               pd,
1485                      const uint32_t *         ps,
1486                      const uint32_t *         pm,
1487                      int                      w)
1488 {
1489     uint32_t s, m;
1490
1491     __m128i xmm_src_lo, xmm_src_hi;
1492     __m128i xmm_mask_lo, xmm_mask_hi;
1493     __m128i xmm_dst_lo, xmm_dst_hi;
1494
1495     while (w && (unsigned long)pd & 15)
1496     {
1497         s = *ps++;
1498         m = *pm++;
1499         *pd++ = pack_1x128_32 (
1500             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1501         w--;
1502     }
1503
1504     while (w >= 4)
1505     {
1506         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1507         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1508
1509         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1510         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1511
1512         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1513                             &xmm_mask_lo, &xmm_mask_hi,
1514                             &xmm_dst_lo, &xmm_dst_hi);
1515
1516         save_128_aligned (
1517             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1518
1519         ps += 4;
1520         pd += 4;
1521         pm += 4;
1522         w -= 4;
1523     }
1524
1525     while (w)
1526     {
1527         s = *ps++;
1528         m = *pm++;
1529         *pd++ = pack_1x128_32 (
1530             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1531         w--;
1532     }
1533 }
1534
1535 static force_inline uint32_t
1536 core_combine_over_ca_pixel_sse2 (uint32_t src,
1537                                  uint32_t mask,
1538                                  uint32_t dst)
1539 {
1540     __m128i s = unpack_32_1x128 (src);
1541     __m128i expAlpha = expand_alpha_1x128 (s);
1542     __m128i unpk_mask = unpack_32_1x128 (mask);
1543     __m128i unpk_dst  = unpack_32_1x128 (dst);
1544
1545     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1546 }
1547
1548 static void
1549 sse2_combine_over_ca (pixman_implementation_t *imp,
1550                       pixman_op_t              op,
1551                       uint32_t *               pd,
1552                       const uint32_t *         ps,
1553                       const uint32_t *         pm,
1554                       int                      w)
1555 {
1556     uint32_t s, m, d;
1557
1558     __m128i xmm_alpha_lo, xmm_alpha_hi;
1559     __m128i xmm_src_lo, xmm_src_hi;
1560     __m128i xmm_dst_lo, xmm_dst_hi;
1561     __m128i xmm_mask_lo, xmm_mask_hi;
1562
1563     while (w && (unsigned long)pd & 15)
1564     {
1565         s = *ps++;
1566         m = *pm++;
1567         d = *pd;
1568
1569         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1570         w--;
1571     }
1572
1573     while (w >= 4)
1574     {
1575         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1576         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1577         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1578
1579         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1580         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1581         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1582
1583         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1584                             &xmm_alpha_lo, &xmm_alpha_hi);
1585
1586         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1587                        &xmm_alpha_lo, &xmm_alpha_hi,
1588                        &xmm_mask_lo, &xmm_mask_hi,
1589                        &xmm_dst_lo, &xmm_dst_hi);
1590
1591         save_128_aligned (
1592             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1593
1594         ps += 4;
1595         pd += 4;
1596         pm += 4;
1597         w -= 4;
1598     }
1599
1600     while (w)
1601     {
1602         s = *ps++;
1603         m = *pm++;
1604         d = *pd;
1605
1606         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1607         w--;
1608     }
1609 }
1610
1611 static force_inline uint32_t
1612 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1613                                          uint32_t mask,
1614                                          uint32_t dst)
1615 {
1616     __m128i d = unpack_32_1x128 (dst);
1617
1618     return pack_1x128_32 (
1619         over_1x128 (d, expand_alpha_1x128 (d),
1620                     pix_multiply_1x128 (unpack_32_1x128 (src),
1621                                         unpack_32_1x128 (mask))));
1622 }
1623
1624 static void
1625 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1626                               pixman_op_t              op,
1627                               uint32_t *               pd,
1628                               const uint32_t *         ps,
1629                               const uint32_t *         pm,
1630                               int                      w)
1631 {
1632     uint32_t s, m, d;
1633
1634     __m128i xmm_alpha_lo, xmm_alpha_hi;
1635     __m128i xmm_src_lo, xmm_src_hi;
1636     __m128i xmm_dst_lo, xmm_dst_hi;
1637     __m128i xmm_mask_lo, xmm_mask_hi;
1638
1639     while (w && (unsigned long)pd & 15)
1640     {
1641         s = *ps++;
1642         m = *pm++;
1643         d = *pd;
1644
1645         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1646         w--;
1647     }
1648
1649     while (w >= 4)
1650     {
1651         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1652         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1653         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1654
1655         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1656         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1657         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1658
1659         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1660                             &xmm_alpha_lo, &xmm_alpha_hi);
1661         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1662                             &xmm_mask_lo, &xmm_mask_hi,
1663                             &xmm_mask_lo, &xmm_mask_hi);
1664
1665         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1666                     &xmm_alpha_lo, &xmm_alpha_hi,
1667                     &xmm_mask_lo, &xmm_mask_hi);
1668
1669         save_128_aligned (
1670             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1671
1672         ps += 4;
1673         pd += 4;
1674         pm += 4;
1675         w -= 4;
1676     }
1677
1678     while (w)
1679     {
1680         s = *ps++;
1681         m = *pm++;
1682         d = *pd;
1683
1684         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1685         w--;
1686     }
1687 }
1688
1689 static void
1690 sse2_combine_in_ca (pixman_implementation_t *imp,
1691                     pixman_op_t              op,
1692                     uint32_t *               pd,
1693                     const uint32_t *         ps,
1694                     const uint32_t *         pm,
1695                     int                      w)
1696 {
1697     uint32_t s, m, d;
1698
1699     __m128i xmm_alpha_lo, xmm_alpha_hi;
1700     __m128i xmm_src_lo, xmm_src_hi;
1701     __m128i xmm_dst_lo, xmm_dst_hi;
1702     __m128i xmm_mask_lo, xmm_mask_hi;
1703
1704     while (w && (unsigned long)pd & 15)
1705     {
1706         s = *ps++;
1707         m = *pm++;
1708         d = *pd;
1709
1710         *pd++ = pack_1x128_32 (
1711             pix_multiply_1x128 (
1712                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1713                 expand_alpha_1x128 (unpack_32_1x128 (d))));
1714
1715         w--;
1716     }
1717
1718     while (w >= 4)
1719     {
1720         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1721         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1722         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1723
1724         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1725         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1726         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1727
1728         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1729                             &xmm_alpha_lo, &xmm_alpha_hi);
1730
1731         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1732                             &xmm_mask_lo, &xmm_mask_hi,
1733                             &xmm_dst_lo, &xmm_dst_hi);
1734
1735         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1736                             &xmm_alpha_lo, &xmm_alpha_hi,
1737                             &xmm_dst_lo, &xmm_dst_hi);
1738
1739         save_128_aligned (
1740             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1741
1742         ps += 4;
1743         pd += 4;
1744         pm += 4;
1745         w -= 4;
1746     }
1747
1748     while (w)
1749     {
1750         s = *ps++;
1751         m = *pm++;
1752         d = *pd;
1753
1754         *pd++ = pack_1x128_32 (
1755             pix_multiply_1x128 (
1756                 pix_multiply_1x128 (
1757                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1758                 expand_alpha_1x128 (unpack_32_1x128 (d))));
1759
1760         w--;
1761     }
1762 }
1763
1764 static void
1765 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1766                             pixman_op_t              op,
1767                             uint32_t *               pd,
1768                             const uint32_t *         ps,
1769                             const uint32_t *         pm,
1770                             int                      w)
1771 {
1772     uint32_t s, m, d;
1773
1774     __m128i xmm_alpha_lo, xmm_alpha_hi;
1775     __m128i xmm_src_lo, xmm_src_hi;
1776     __m128i xmm_dst_lo, xmm_dst_hi;
1777     __m128i xmm_mask_lo, xmm_mask_hi;
1778
1779     while (w && (unsigned long)pd & 15)
1780     {
1781         s = *ps++;
1782         m = *pm++;
1783         d = *pd;
1784
1785         *pd++ = pack_1x128_32 (
1786             pix_multiply_1x128 (
1787                 unpack_32_1x128 (d),
1788                 pix_multiply_1x128 (unpack_32_1x128 (m),
1789                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1790         w--;
1791     }
1792
1793     while (w >= 4)
1794     {
1795         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1796         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1797         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1798
1799         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1800         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1801         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1802
1803         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1804                             &xmm_alpha_lo, &xmm_alpha_hi);
1805         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1806                             &xmm_alpha_lo, &xmm_alpha_hi,
1807                             &xmm_alpha_lo, &xmm_alpha_hi);
1808
1809         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1810                             &xmm_alpha_lo, &xmm_alpha_hi,
1811                             &xmm_dst_lo, &xmm_dst_hi);
1812
1813         save_128_aligned (
1814             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1815
1816         ps += 4;
1817         pd += 4;
1818         pm += 4;
1819         w -= 4;
1820     }
1821
1822     while (w)
1823     {
1824         s = *ps++;
1825         m = *pm++;
1826         d = *pd;
1827
1828         *pd++ = pack_1x128_32 (
1829             pix_multiply_1x128 (
1830                 unpack_32_1x128 (d),
1831                 pix_multiply_1x128 (unpack_32_1x128 (m),
1832                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1833         w--;
1834     }
1835 }
1836
1837 static void
1838 sse2_combine_out_ca (pixman_implementation_t *imp,
1839                      pixman_op_t              op,
1840                      uint32_t *               pd,
1841                      const uint32_t *         ps,
1842                      const uint32_t *         pm,
1843                      int                      w)
1844 {
1845     uint32_t s, m, d;
1846
1847     __m128i xmm_alpha_lo, xmm_alpha_hi;
1848     __m128i xmm_src_lo, xmm_src_hi;
1849     __m128i xmm_dst_lo, xmm_dst_hi;
1850     __m128i xmm_mask_lo, xmm_mask_hi;
1851
1852     while (w && (unsigned long)pd & 15)
1853     {
1854         s = *ps++;
1855         m = *pm++;
1856         d = *pd;
1857
1858         *pd++ = pack_1x128_32 (
1859             pix_multiply_1x128 (
1860                 pix_multiply_1x128 (
1861                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1862                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1863         w--;
1864     }
1865
1866     while (w >= 4)
1867     {
1868         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1869         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1870         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1871
1872         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1873         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1874         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1875
1876         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1877                             &xmm_alpha_lo, &xmm_alpha_hi);
1878         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1879                       &xmm_alpha_lo, &xmm_alpha_hi);
1880
1881         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1882                             &xmm_mask_lo, &xmm_mask_hi,
1883                             &xmm_dst_lo, &xmm_dst_hi);
1884         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1885                             &xmm_alpha_lo, &xmm_alpha_hi,
1886                             &xmm_dst_lo, &xmm_dst_hi);
1887
1888         save_128_aligned (
1889             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1890
1891         ps += 4;
1892         pd += 4;
1893         pm += 4;
1894         w -= 4;
1895     }
1896
1897     while (w)
1898     {
1899         s = *ps++;
1900         m = *pm++;
1901         d = *pd;
1902
1903         *pd++ = pack_1x128_32 (
1904             pix_multiply_1x128 (
1905                 pix_multiply_1x128 (
1906                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1907                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1908
1909         w--;
1910     }
1911 }
1912
1913 static void
1914 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1915                              pixman_op_t              op,
1916                              uint32_t *               pd,
1917                              const uint32_t *         ps,
1918                              const uint32_t *         pm,
1919                              int                      w)
1920 {
1921     uint32_t s, m, d;
1922
1923     __m128i xmm_alpha_lo, xmm_alpha_hi;
1924     __m128i xmm_src_lo, xmm_src_hi;
1925     __m128i xmm_dst_lo, xmm_dst_hi;
1926     __m128i xmm_mask_lo, xmm_mask_hi;
1927
1928     while (w && (unsigned long)pd & 15)
1929     {
1930         s = *ps++;
1931         m = *pm++;
1932         d = *pd;
1933
1934         *pd++ = pack_1x128_32 (
1935             pix_multiply_1x128 (
1936                 unpack_32_1x128 (d),
1937                 negate_1x128 (pix_multiply_1x128 (
1938                                  unpack_32_1x128 (m),
1939                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1940         w--;
1941     }
1942
1943     while (w >= 4)
1944     {
1945         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1946         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1947         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1948
1949         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1950         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1951         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1952
1953         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1954                             &xmm_alpha_lo, &xmm_alpha_hi);
1955
1956         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1957                             &xmm_alpha_lo, &xmm_alpha_hi,
1958                             &xmm_mask_lo, &xmm_mask_hi);
1959
1960         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1961                       &xmm_mask_lo, &xmm_mask_hi);
1962
1963         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1964                             &xmm_mask_lo, &xmm_mask_hi,
1965                             &xmm_dst_lo, &xmm_dst_hi);
1966
1967         save_128_aligned (
1968             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1969
1970         ps += 4;
1971         pd += 4;
1972         pm += 4;
1973         w -= 4;
1974     }
1975
1976     while (w)
1977     {
1978         s = *ps++;
1979         m = *pm++;
1980         d = *pd;
1981
1982         *pd++ = pack_1x128_32 (
1983             pix_multiply_1x128 (
1984                 unpack_32_1x128 (d),
1985                 negate_1x128 (pix_multiply_1x128 (
1986                                  unpack_32_1x128 (m),
1987                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1988         w--;
1989     }
1990 }
1991
1992 static force_inline uint32_t
1993 core_combine_atop_ca_pixel_sse2 (uint32_t src,
1994                                  uint32_t mask,
1995                                  uint32_t dst)
1996 {
1997     __m128i m = unpack_32_1x128 (mask);
1998     __m128i s = unpack_32_1x128 (src);
1999     __m128i d = unpack_32_1x128 (dst);
2000     __m128i sa = expand_alpha_1x128 (s);
2001     __m128i da = expand_alpha_1x128 (d);
2002
2003     s = pix_multiply_1x128 (s, m);
2004     m = negate_1x128 (pix_multiply_1x128 (m, sa));
2005
2006     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2007 }
2008
2009 static void
2010 sse2_combine_atop_ca (pixman_implementation_t *imp,
2011                       pixman_op_t              op,
2012                       uint32_t *               pd,
2013                       const uint32_t *         ps,
2014                       const uint32_t *         pm,
2015                       int                      w)
2016 {
2017     uint32_t s, m, d;
2018
2019     __m128i xmm_src_lo, xmm_src_hi;
2020     __m128i xmm_dst_lo, xmm_dst_hi;
2021     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2022     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2023     __m128i xmm_mask_lo, xmm_mask_hi;
2024
2025     while (w && (unsigned long)pd & 15)
2026     {
2027         s = *ps++;
2028         m = *pm++;
2029         d = *pd;
2030
2031         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2032         w--;
2033     }
2034
2035     while (w >= 4)
2036     {
2037         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2038         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2039         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2040
2041         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2042         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2043         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2044
2045         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2046                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2047         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2048                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2049
2050         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2051                             &xmm_mask_lo, &xmm_mask_hi,
2052                             &xmm_src_lo, &xmm_src_hi);
2053         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2054                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2055                             &xmm_mask_lo, &xmm_mask_hi);
2056
2057         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2058
2059         pix_add_multiply_2x128 (
2060             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2061             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2062             &xmm_dst_lo, &xmm_dst_hi);
2063
2064         save_128_aligned (
2065             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2066
2067         ps += 4;
2068         pd += 4;
2069         pm += 4;
2070         w -= 4;
2071     }
2072
2073     while (w)
2074     {
2075         s = *ps++;
2076         m = *pm++;
2077         d = *pd;
2078
2079         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2080         w--;
2081     }
2082 }
2083
2084 static force_inline uint32_t
2085 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2086                                          uint32_t mask,
2087                                          uint32_t dst)
2088 {
2089     __m128i m = unpack_32_1x128 (mask);
2090     __m128i s = unpack_32_1x128 (src);
2091     __m128i d = unpack_32_1x128 (dst);
2092
2093     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2094     __m128i sa = expand_alpha_1x128 (s);
2095
2096     s = pix_multiply_1x128 (s, m);
2097     m = pix_multiply_1x128 (m, sa);
2098
2099     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2100 }
2101
2102 static void
2103 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2104                               pixman_op_t              op,
2105                               uint32_t *               pd,
2106                               const uint32_t *         ps,
2107                               const uint32_t *         pm,
2108                               int                      w)
2109 {
2110     uint32_t s, m, d;
2111
2112     __m128i xmm_src_lo, xmm_src_hi;
2113     __m128i xmm_dst_lo, xmm_dst_hi;
2114     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2115     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2116     __m128i xmm_mask_lo, xmm_mask_hi;
2117
2118     while (w && (unsigned long)pd & 15)
2119     {
2120         s = *ps++;
2121         m = *pm++;
2122         d = *pd;
2123
2124         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2125         w--;
2126     }
2127
2128     while (w >= 4)
2129     {
2130         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2131         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2132         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2133
2134         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2135         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2136         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2137
2138         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2139                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2140         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2141                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2142
2143         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2144                             &xmm_mask_lo, &xmm_mask_hi,
2145                             &xmm_src_lo, &xmm_src_hi);
2146         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2147                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2148                             &xmm_mask_lo, &xmm_mask_hi);
2149
2150         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2151                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2152
2153         pix_add_multiply_2x128 (
2154             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2155             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2156             &xmm_dst_lo, &xmm_dst_hi);
2157
2158         save_128_aligned (
2159             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2160
2161         ps += 4;
2162         pd += 4;
2163         pm += 4;
2164         w -= 4;
2165     }
2166
2167     while (w)
2168     {
2169         s = *ps++;
2170         m = *pm++;
2171         d = *pd;
2172
2173         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2174         w--;
2175     }
2176 }
2177
2178 static force_inline uint32_t
2179 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2180                                 uint32_t mask,
2181                                 uint32_t dst)
2182 {
2183     __m128i a = unpack_32_1x128 (mask);
2184     __m128i s = unpack_32_1x128 (src);
2185     __m128i d = unpack_32_1x128 (dst);
2186
2187     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2188                                        a, expand_alpha_1x128 (s)));
2189     __m128i dest      = pix_multiply_1x128 (s, a);
2190     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2191
2192     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2193                                                 &alpha_dst,
2194                                                 &dest,
2195                                                 &alpha_src));
2196 }
2197
2198 static void
2199 sse2_combine_xor_ca (pixman_implementation_t *imp,
2200                      pixman_op_t              op,
2201                      uint32_t *               pd,
2202                      const uint32_t *         ps,
2203                      const uint32_t *         pm,
2204                      int                      w)
2205 {
2206     uint32_t s, m, d;
2207
2208     __m128i xmm_src_lo, xmm_src_hi;
2209     __m128i xmm_dst_lo, xmm_dst_hi;
2210     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2211     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2212     __m128i xmm_mask_lo, xmm_mask_hi;
2213
2214     while (w && (unsigned long)pd & 15)
2215     {
2216         s = *ps++;
2217         m = *pm++;
2218         d = *pd;
2219
2220         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2221         w--;
2222     }
2223
2224     while (w >= 4)
2225     {
2226         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2227         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2228         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2229
2230         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2231         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2232         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2233
2234         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2235                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2236         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2237                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2238
2239         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2240                             &xmm_mask_lo, &xmm_mask_hi,
2241                             &xmm_src_lo, &xmm_src_hi);
2242         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2243                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2244                             &xmm_mask_lo, &xmm_mask_hi);
2245
2246         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2247                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2248         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2249                       &xmm_mask_lo, &xmm_mask_hi);
2250
2251         pix_add_multiply_2x128 (
2252             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2253             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2254             &xmm_dst_lo, &xmm_dst_hi);
2255
2256         save_128_aligned (
2257             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2258
2259         ps += 4;
2260         pd += 4;
2261         pm += 4;
2262         w -= 4;
2263     }
2264
2265     while (w)
2266     {
2267         s = *ps++;
2268         m = *pm++;
2269         d = *pd;
2270
2271         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2272         w--;
2273     }
2274 }
2275
2276 static void
2277 sse2_combine_add_ca (pixman_implementation_t *imp,
2278                      pixman_op_t              op,
2279                      uint32_t *               pd,
2280                      const uint32_t *         ps,
2281                      const uint32_t *         pm,
2282                      int                      w)
2283 {
2284     uint32_t s, m, d;
2285
2286     __m128i xmm_src_lo, xmm_src_hi;
2287     __m128i xmm_dst_lo, xmm_dst_hi;
2288     __m128i xmm_mask_lo, xmm_mask_hi;
2289
2290     while (w && (unsigned long)pd & 15)
2291     {
2292         s = *ps++;
2293         m = *pm++;
2294         d = *pd;
2295
2296         *pd++ = pack_1x128_32 (
2297             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2298                                                unpack_32_1x128 (m)),
2299                            unpack_32_1x128 (d)));
2300         w--;
2301     }
2302
2303     while (w >= 4)
2304     {
2305         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2306         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2307         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2308
2309         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2310         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2311         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2312
2313         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2314                             &xmm_mask_lo, &xmm_mask_hi,
2315                             &xmm_src_lo, &xmm_src_hi);
2316
2317         save_128_aligned (
2318             (__m128i*)pd, pack_2x128_128 (
2319                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2320                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2321
2322         ps += 4;
2323         pd += 4;
2324         pm += 4;
2325         w -= 4;
2326     }
2327
2328     while (w)
2329     {
2330         s = *ps++;
2331         m = *pm++;
2332         d = *pd;
2333
2334         *pd++ = pack_1x128_32 (
2335             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2336                                                unpack_32_1x128 (m)),
2337                            unpack_32_1x128 (d)));
2338         w--;
2339     }
2340 }
2341
2342 static force_inline __m128i
2343 create_mask_16_128 (uint16_t mask)
2344 {
2345     return _mm_set1_epi16 (mask);
2346 }
2347
2348 /* Work around a code generation bug in Sun Studio 12. */
2349 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2350 # define create_mask_2x32_128(mask0, mask1)                             \
2351     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2352 #else
2353 static force_inline __m128i
2354 create_mask_2x32_128 (uint32_t mask0,
2355                       uint32_t mask1)
2356 {
2357     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2358 }
2359 #endif
2360
2361 static void
2362 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2363                             pixman_composite_info_t *info)
2364 {
2365     PIXMAN_COMPOSITE_ARGS (info);
2366     uint32_t src;
2367     uint32_t    *dst_line, *dst, d;
2368     int32_t w;
2369     int dst_stride;
2370     __m128i xmm_src, xmm_alpha;
2371     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2372
2373     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2374
2375     if (src == 0)
2376         return;
2377
2378     PIXMAN_IMAGE_GET_LINE (
2379         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2380
2381     xmm_src = expand_pixel_32_1x128 (src);
2382     xmm_alpha = expand_alpha_1x128 (xmm_src);
2383
2384     while (height--)
2385     {
2386         dst = dst_line;
2387
2388         dst_line += dst_stride;
2389         w = width;
2390
2391         while (w && (unsigned long)dst & 15)
2392         {
2393             d = *dst;
2394             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2395                                                 xmm_alpha,
2396                                                 unpack_32_1x128 (d)));
2397             w--;
2398         }
2399
2400         while (w >= 4)
2401         {
2402             xmm_dst = load_128_aligned ((__m128i*)dst);
2403
2404             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2405
2406             over_2x128 (&xmm_src, &xmm_src,
2407                         &xmm_alpha, &xmm_alpha,
2408                         &xmm_dst_lo, &xmm_dst_hi);
2409
2410             /* rebuid the 4 pixel data and save*/
2411             save_128_aligned (
2412                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2413
2414             w -= 4;
2415             dst += 4;
2416         }
2417
2418         while (w)
2419         {
2420             d = *dst;
2421             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2422                                                 xmm_alpha,
2423                                                 unpack_32_1x128 (d)));
2424             w--;
2425         }
2426
2427     }
2428 }
2429
2430 static void
2431 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2432                             pixman_composite_info_t *info)
2433 {
2434     PIXMAN_COMPOSITE_ARGS (info);
2435     uint32_t src;
2436     uint16_t    *dst_line, *dst, d;
2437     int32_t w;
2438     int dst_stride;
2439     __m128i xmm_src, xmm_alpha;
2440     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2441
2442     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2443
2444     if (src == 0)
2445         return;
2446
2447     PIXMAN_IMAGE_GET_LINE (
2448         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2449
2450     xmm_src = expand_pixel_32_1x128 (src);
2451     xmm_alpha = expand_alpha_1x128 (xmm_src);
2452
2453     while (height--)
2454     {
2455         dst = dst_line;
2456
2457         dst_line += dst_stride;
2458         w = width;
2459
2460         while (w && (unsigned long)dst & 15)
2461         {
2462             d = *dst;
2463
2464             *dst++ = pack_565_32_16 (
2465                 pack_1x128_32 (over_1x128 (xmm_src,
2466                                            xmm_alpha,
2467                                            expand565_16_1x128 (d))));
2468             w--;
2469         }
2470
2471         while (w >= 8)
2472         {
2473             xmm_dst = load_128_aligned ((__m128i*)dst);
2474
2475             unpack_565_128_4x128 (xmm_dst,
2476                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2477
2478             over_2x128 (&xmm_src, &xmm_src,
2479                         &xmm_alpha, &xmm_alpha,
2480                         &xmm_dst0, &xmm_dst1);
2481             over_2x128 (&xmm_src, &xmm_src,
2482                         &xmm_alpha, &xmm_alpha,
2483                         &xmm_dst2, &xmm_dst3);
2484
2485             xmm_dst = pack_565_4x128_128 (
2486                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2487
2488             save_128_aligned ((__m128i*)dst, xmm_dst);
2489
2490             dst += 8;
2491             w -= 8;
2492         }
2493
2494         while (w--)
2495         {
2496             d = *dst;
2497             *dst++ = pack_565_32_16 (
2498                 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2499                                            expand565_16_1x128 (d))));
2500         }
2501     }
2502
2503 }
2504
2505 static void
2506 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2507                                    pixman_composite_info_t *info)
2508 {
2509     PIXMAN_COMPOSITE_ARGS (info);
2510     uint32_t src;
2511     uint32_t    *dst_line, d;
2512     uint32_t    *mask_line, m;
2513     uint32_t pack_cmp;
2514     int dst_stride, mask_stride;
2515
2516     __m128i xmm_src;
2517     __m128i xmm_dst;
2518     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2519
2520     __m128i mmx_src, mmx_mask, mmx_dest;
2521
2522     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2523
2524     if (src == 0)
2525         return;
2526
2527     PIXMAN_IMAGE_GET_LINE (
2528         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2529     PIXMAN_IMAGE_GET_LINE (
2530         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2531
2532     xmm_src = _mm_unpacklo_epi8 (
2533         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2534     mmx_src   = xmm_src;
2535
2536     while (height--)
2537     {
2538         int w = width;
2539         const uint32_t *pm = (uint32_t *)mask_line;
2540         uint32_t *pd = (uint32_t *)dst_line;
2541
2542         dst_line += dst_stride;
2543         mask_line += mask_stride;
2544
2545         while (w && (unsigned long)pd & 15)
2546         {
2547             m = *pm++;
2548
2549             if (m)
2550             {
2551                 d = *pd;
2552
2553                 mmx_mask = unpack_32_1x128 (m);
2554                 mmx_dest = unpack_32_1x128 (d);
2555
2556                 *pd = pack_1x128_32 (
2557                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2558                                    mmx_dest));
2559             }
2560
2561             pd++;
2562             w--;
2563         }
2564
2565         while (w >= 4)
2566         {
2567             xmm_mask = load_128_unaligned ((__m128i*)pm);
2568
2569             pack_cmp =
2570                 _mm_movemask_epi8 (
2571                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2572
2573             /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
2574             if (pack_cmp != 0xffff)
2575             {
2576                 xmm_dst = load_128_aligned ((__m128i*)pd);
2577
2578                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2579
2580                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2581                                     &xmm_mask_lo, &xmm_mask_hi,
2582                                     &xmm_mask_lo, &xmm_mask_hi);
2583                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2584
2585                 save_128_aligned (
2586                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2587             }
2588
2589             pd += 4;
2590             pm += 4;
2591             w -= 4;
2592         }
2593
2594         while (w)
2595         {
2596             m = *pm++;
2597
2598             if (m)
2599             {
2600                 d = *pd;
2601
2602                 mmx_mask = unpack_32_1x128 (m);
2603                 mmx_dest = unpack_32_1x128 (d);
2604
2605                 *pd = pack_1x128_32 (
2606                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2607                                    mmx_dest));
2608             }
2609
2610             pd++;
2611             w--;
2612         }
2613     }
2614
2615 }
2616
2617 static void
2618 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2619                                     pixman_composite_info_t *info)
2620 {
2621     PIXMAN_COMPOSITE_ARGS (info);
2622     uint32_t src;
2623     uint32_t    *dst_line, d;
2624     uint32_t    *mask_line, m;
2625     uint32_t pack_cmp;
2626     int dst_stride, mask_stride;
2627
2628     __m128i xmm_src, xmm_alpha;
2629     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2630     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2631
2632     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2633
2634     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2635
2636     if (src == 0)
2637         return;
2638
2639     PIXMAN_IMAGE_GET_LINE (
2640         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2641     PIXMAN_IMAGE_GET_LINE (
2642         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2643
2644     xmm_src = _mm_unpacklo_epi8 (
2645         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2646     xmm_alpha = expand_alpha_1x128 (xmm_src);
2647     mmx_src   = xmm_src;
2648     mmx_alpha = xmm_alpha;
2649
2650     while (height--)
2651     {
2652         int w = width;
2653         const uint32_t *pm = (uint32_t *)mask_line;
2654         uint32_t *pd = (uint32_t *)dst_line;
2655
2656         dst_line += dst_stride;
2657         mask_line += mask_stride;
2658
2659         while (w && (unsigned long)pd & 15)
2660         {
2661             m = *pm++;
2662
2663             if (m)
2664             {
2665                 d = *pd;
2666                 mmx_mask = unpack_32_1x128 (m);
2667                 mmx_dest = unpack_32_1x128 (d);
2668
2669                 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2670                                                   &mmx_alpha,
2671                                                   &mmx_mask,
2672                                                   &mmx_dest));
2673             }
2674
2675             pd++;
2676             w--;
2677         }
2678
2679         while (w >= 4)
2680         {
2681             xmm_mask = load_128_unaligned ((__m128i*)pm);
2682
2683             pack_cmp =
2684                 _mm_movemask_epi8 (
2685                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2686
2687             /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
2688             if (pack_cmp != 0xffff)
2689             {
2690                 xmm_dst = load_128_aligned ((__m128i*)pd);
2691
2692                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2693                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2694
2695                 in_over_2x128 (&xmm_src, &xmm_src,
2696                                &xmm_alpha, &xmm_alpha,
2697                                &xmm_mask_lo, &xmm_mask_hi,
2698                                &xmm_dst_lo, &xmm_dst_hi);
2699
2700                 save_128_aligned (
2701                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2702             }
2703
2704             pd += 4;
2705             pm += 4;
2706             w -= 4;
2707         }
2708
2709         while (w)
2710         {
2711             m = *pm++;
2712
2713             if (m)
2714             {
2715                 d = *pd;
2716                 mmx_mask = unpack_32_1x128 (m);
2717                 mmx_dest = unpack_32_1x128 (d);
2718
2719                 *pd = pack_1x128_32 (
2720                     in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2721             }
2722
2723             pd++;
2724             w--;
2725         }
2726     }
2727
2728 }
2729
2730 static void
2731 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2732                                  pixman_composite_info_t *info)
2733 {
2734     PIXMAN_COMPOSITE_ARGS (info);
2735     uint32_t    *dst_line, *dst;
2736     uint32_t    *src_line, *src;
2737     uint32_t mask;
2738     int32_t w;
2739     int dst_stride, src_stride;
2740
2741     __m128i xmm_mask;
2742     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2743     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2744     __m128i xmm_alpha_lo, xmm_alpha_hi;
2745
2746     PIXMAN_IMAGE_GET_LINE (
2747         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2748     PIXMAN_IMAGE_GET_LINE (
2749         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2750
2751     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2752
2753     xmm_mask = create_mask_16_128 (mask >> 24);
2754
2755     while (height--)
2756     {
2757         dst = dst_line;
2758         dst_line += dst_stride;
2759         src = src_line;
2760         src_line += src_stride;
2761         w = width;
2762
2763         while (w && (unsigned long)dst & 15)
2764         {
2765             uint32_t s = *src++;
2766
2767             if (s)
2768             {
2769                 uint32_t d = *dst;
2770                 
2771                 __m128i ms = unpack_32_1x128 (s);
2772                 __m128i alpha    = expand_alpha_1x128 (ms);
2773                 __m128i dest     = xmm_mask;
2774                 __m128i alpha_dst = unpack_32_1x128 (d);
2775                 
2776                 *dst = pack_1x128_32 (
2777                     in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2778             }
2779             dst++;
2780             w--;
2781         }
2782
2783         while (w >= 4)
2784         {
2785             xmm_src = load_128_unaligned ((__m128i*)src);
2786
2787             if (!is_zero (xmm_src))
2788             {
2789                 xmm_dst = load_128_aligned ((__m128i*)dst);
2790                 
2791                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2792                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2793                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2794                                     &xmm_alpha_lo, &xmm_alpha_hi);
2795                 
2796                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2797                                &xmm_alpha_lo, &xmm_alpha_hi,
2798                                &xmm_mask, &xmm_mask,
2799                                &xmm_dst_lo, &xmm_dst_hi);
2800                 
2801                 save_128_aligned (
2802                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2803             }
2804                 
2805             dst += 4;
2806             src += 4;
2807             w -= 4;
2808         }
2809
2810         while (w)
2811         {
2812             uint32_t s = *src++;
2813
2814             if (s)
2815             {
2816                 uint32_t d = *dst;
2817                 
2818                 __m128i ms = unpack_32_1x128 (s);
2819                 __m128i alpha = expand_alpha_1x128 (ms);
2820                 __m128i mask  = xmm_mask;
2821                 __m128i dest  = unpack_32_1x128 (d);
2822                 
2823                 *dst = pack_1x128_32 (
2824                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2825             }
2826
2827             dst++;
2828             w--;
2829         }
2830     }
2831
2832 }
2833
2834 static void
2835 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2836                               pixman_composite_info_t *info)
2837 {
2838     PIXMAN_COMPOSITE_ARGS (info);
2839     uint32_t    *dst_line, *dst;
2840     uint32_t    *src_line, *src;
2841     int32_t w;
2842     int dst_stride, src_stride;
2843
2844
2845     PIXMAN_IMAGE_GET_LINE (
2846         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2847     PIXMAN_IMAGE_GET_LINE (
2848         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2849
2850     while (height--)
2851     {
2852         dst = dst_line;
2853         dst_line += dst_stride;
2854         src = src_line;
2855         src_line += src_stride;
2856         w = width;
2857
2858         while (w && (unsigned long)dst & 15)
2859         {
2860             *dst++ = *src++ | 0xff000000;
2861             w--;
2862         }
2863
2864         while (w >= 16)
2865         {
2866             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2867             
2868             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2869             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2870             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2871             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2872             
2873             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2874             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2875             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2876             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2877             
2878             dst += 16;
2879             src += 16;
2880             w -= 16;
2881         }
2882
2883         while (w)
2884         {
2885             *dst++ = *src++ | 0xff000000;
2886             w--;
2887         }
2888     }
2889
2890 }
2891
2892 static void
2893 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2894                                  pixman_composite_info_t *info)
2895 {
2896     PIXMAN_COMPOSITE_ARGS (info);
2897     uint32_t    *dst_line, *dst;
2898     uint32_t    *src_line, *src;
2899     uint32_t mask;
2900     int dst_stride, src_stride;
2901     int32_t w;
2902
2903     __m128i xmm_mask, xmm_alpha;
2904     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2905     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2906
2907     PIXMAN_IMAGE_GET_LINE (
2908         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2909     PIXMAN_IMAGE_GET_LINE (
2910         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2911
2912     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2913
2914     xmm_mask = create_mask_16_128 (mask >> 24);
2915     xmm_alpha = mask_00ff;
2916
2917     while (height--)
2918     {
2919         dst = dst_line;
2920         dst_line += dst_stride;
2921         src = src_line;
2922         src_line += src_stride;
2923         w = width;
2924
2925         while (w && (unsigned long)dst & 15)
2926         {
2927             uint32_t s = (*src++) | 0xff000000;
2928             uint32_t d = *dst;
2929
2930             __m128i src   = unpack_32_1x128 (s);
2931             __m128i alpha = xmm_alpha;
2932             __m128i mask  = xmm_mask;
2933             __m128i dest  = unpack_32_1x128 (d);
2934
2935             *dst++ = pack_1x128_32 (
2936                 in_over_1x128 (&src, &alpha, &mask, &dest));
2937
2938             w--;
2939         }
2940
2941         while (w >= 4)
2942         {
2943             xmm_src = _mm_or_si128 (
2944                 load_128_unaligned ((__m128i*)src), mask_ff000000);
2945             xmm_dst = load_128_aligned ((__m128i*)dst);
2946
2947             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2948             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2949
2950             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2951                            &xmm_alpha, &xmm_alpha,
2952                            &xmm_mask, &xmm_mask,
2953                            &xmm_dst_lo, &xmm_dst_hi);
2954
2955             save_128_aligned (
2956                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2957
2958             dst += 4;
2959             src += 4;
2960             w -= 4;
2961
2962         }
2963
2964         while (w)
2965         {
2966             uint32_t s = (*src++) | 0xff000000;
2967             uint32_t d = *dst;
2968
2969             __m128i src  = unpack_32_1x128 (s);
2970             __m128i alpha = xmm_alpha;
2971             __m128i mask  = xmm_mask;
2972             __m128i dest  = unpack_32_1x128 (d);
2973
2974             *dst++ = pack_1x128_32 (
2975                 in_over_1x128 (&src, &alpha, &mask, &dest));
2976
2977             w--;
2978         }
2979     }
2980
2981 }
2982
2983 static void
2984 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
2985                                pixman_composite_info_t *info)
2986 {
2987     PIXMAN_COMPOSITE_ARGS (info);
2988     int dst_stride, src_stride;
2989     uint32_t    *dst_line, *dst;
2990     uint32_t    *src_line, *src;
2991
2992     PIXMAN_IMAGE_GET_LINE (
2993         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2994     PIXMAN_IMAGE_GET_LINE (
2995         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2996
2997     dst = dst_line;
2998     src = src_line;
2999
3000     while (height--)
3001     {
3002         sse2_combine_over_u (imp, op, dst, src, NULL, width);
3003
3004         dst += dst_stride;
3005         src += src_stride;
3006     }
3007 }
3008
3009 static force_inline uint16_t
3010 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3011 {
3012     __m128i ms;
3013
3014     ms = unpack_32_1x128 (src);
3015     return pack_565_32_16 (
3016         pack_1x128_32 (
3017             over_1x128 (
3018                 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3019 }
3020
3021 static void
3022 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3023                                pixman_composite_info_t *info)
3024 {
3025     PIXMAN_COMPOSITE_ARGS (info);
3026     uint16_t    *dst_line, *dst, d;
3027     uint32_t    *src_line, *src, s;
3028     int dst_stride, src_stride;
3029     int32_t w;
3030
3031     __m128i xmm_alpha_lo, xmm_alpha_hi;
3032     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3033     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3034
3035     PIXMAN_IMAGE_GET_LINE (
3036         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3037     PIXMAN_IMAGE_GET_LINE (
3038         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3039
3040     while (height--)
3041     {
3042         dst = dst_line;
3043         src = src_line;
3044
3045         dst_line += dst_stride;
3046         src_line += src_stride;
3047         w = width;
3048
3049         /* Align dst on a 16-byte boundary */
3050         while (w &&
3051                ((unsigned long)dst & 15))
3052         {
3053             s = *src++;
3054             d = *dst;
3055
3056             *dst++ = composite_over_8888_0565pixel (s, d);
3057             w--;
3058         }
3059
3060         /* It's a 8 pixel loop */
3061         while (w >= 8)
3062         {
3063             /* I'm loading unaligned because I'm not sure
3064              * about the address alignment.
3065              */
3066             xmm_src = load_128_unaligned ((__m128i*) src);
3067             xmm_dst = load_128_aligned ((__m128i*) dst);
3068
3069             /* Unpacking */
3070             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3071             unpack_565_128_4x128 (xmm_dst,
3072                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3073             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3074                                 &xmm_alpha_lo, &xmm_alpha_hi);
3075
3076             /* I'm loading next 4 pixels from memory
3077              * before to optimze the memory read.
3078              */
3079             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3080
3081             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3082                         &xmm_alpha_lo, &xmm_alpha_hi,
3083                         &xmm_dst0, &xmm_dst1);
3084
3085             /* Unpacking */
3086             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3087             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3088                                 &xmm_alpha_lo, &xmm_alpha_hi);
3089
3090             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3091                         &xmm_alpha_lo, &xmm_alpha_hi,
3092                         &xmm_dst2, &xmm_dst3);
3093
3094             save_128_aligned (
3095                 (__m128i*)dst, pack_565_4x128_128 (
3096                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3097
3098             w -= 8;
3099             dst += 8;
3100             src += 8;
3101         }
3102
3103         while (w--)
3104         {
3105             s = *src++;
3106             d = *dst;
3107
3108             *dst++ = composite_over_8888_0565pixel (s, d);
3109         }
3110     }
3111
3112 }
3113
3114 static void
3115 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3116                               pixman_composite_info_t *info)
3117 {
3118     PIXMAN_COMPOSITE_ARGS (info);
3119     uint32_t src, srca;
3120     uint32_t *dst_line, *dst;
3121     uint8_t *mask_line, *mask;
3122     int dst_stride, mask_stride;
3123     int32_t w;
3124     uint32_t m, d;
3125
3126     __m128i xmm_src, xmm_alpha, xmm_def;
3127     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3128     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3129
3130     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3131
3132     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3133
3134     srca = src >> 24;
3135     if (src == 0)
3136         return;
3137
3138     PIXMAN_IMAGE_GET_LINE (
3139         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3140     PIXMAN_IMAGE_GET_LINE (
3141         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3142
3143     xmm_def = create_mask_2x32_128 (src, src);
3144     xmm_src = expand_pixel_32_1x128 (src);
3145     xmm_alpha = expand_alpha_1x128 (xmm_src);
3146     mmx_src   = xmm_src;
3147     mmx_alpha = xmm_alpha;
3148
3149     while (height--)
3150     {
3151         dst = dst_line;
3152         dst_line += dst_stride;
3153         mask = mask_line;
3154         mask_line += mask_stride;
3155         w = width;
3156
3157         while (w && (unsigned long)dst & 15)
3158         {
3159             uint8_t m = *mask++;
3160
3161             if (m)
3162             {
3163                 d = *dst;
3164                 mmx_mask = expand_pixel_8_1x128 (m);
3165                 mmx_dest = unpack_32_1x128 (d);
3166
3167                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3168                                                    &mmx_alpha,
3169                                                    &mmx_mask,
3170                                                    &mmx_dest));
3171             }
3172
3173             w--;
3174             dst++;
3175         }
3176
3177         while (w >= 4)
3178         {
3179             m = *((uint32_t*)mask);
3180
3181             if (srca == 0xff && m == 0xffffffff)
3182             {
3183                 save_128_aligned ((__m128i*)dst, xmm_def);
3184             }
3185             else if (m)
3186             {
3187                 xmm_dst = load_128_aligned ((__m128i*) dst);
3188                 xmm_mask = unpack_32_1x128 (m);
3189                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3190
3191                 /* Unpacking */
3192                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3193                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3194
3195                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3196                                         &xmm_mask_lo, &xmm_mask_hi);
3197
3198                 in_over_2x128 (&xmm_src, &xmm_src,
3199                                &xmm_alpha, &xmm_alpha,
3200                                &xmm_mask_lo, &xmm_mask_hi,
3201                                &xmm_dst_lo, &xmm_dst_hi);
3202
3203                 save_128_aligned (
3204                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3205             }
3206
3207             w -= 4;
3208             dst += 4;
3209             mask += 4;
3210         }
3211
3212         while (w)
3213         {
3214             uint8_t m = *mask++;
3215
3216             if (m)
3217             {
3218                 d = *dst;
3219                 mmx_mask = expand_pixel_8_1x128 (m);
3220                 mmx_dest = unpack_32_1x128 (d);
3221
3222                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3223                                                    &mmx_alpha,
3224                                                    &mmx_mask,
3225                                                    &mmx_dest));
3226             }
3227
3228             w--;
3229             dst++;
3230         }
3231     }
3232
3233 }
3234
3235 static pixman_bool_t
3236 pixman_fill_sse2 (uint32_t *bits,
3237                   int       stride,
3238                   int       bpp,
3239                   int       x,
3240                   int       y,
3241                   int       width,
3242                   int       height,
3243                   uint32_t  data)
3244 {
3245     uint32_t byte_width;
3246     uint8_t         *byte_line;
3247
3248     __m128i xmm_def;
3249
3250     if (bpp == 8)
3251     {
3252         uint8_t b;
3253         uint16_t w;
3254
3255         stride = stride * (int) sizeof (uint32_t) / 1;
3256         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3257         byte_width = width;
3258         stride *= 1;
3259
3260         b = data & 0xff;
3261         w = (b << 8) | b;
3262         data = (w << 16) | w;
3263     }
3264     else if (bpp == 16)
3265     {
3266         stride = stride * (int) sizeof (uint32_t) / 2;
3267         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3268         byte_width = 2 * width;
3269         stride *= 2;
3270
3271         data = (data & 0xffff) * 0x00010001;
3272     }
3273     else if (bpp == 32)
3274     {
3275         stride = stride * (int) sizeof (uint32_t) / 4;
3276         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3277         byte_width = 4 * width;
3278         stride *= 4;
3279     }
3280     else
3281     {
3282         return FALSE;
3283     }
3284
3285     xmm_def = create_mask_2x32_128 (data, data);
3286
3287     while (height--)
3288     {
3289         int w;
3290         uint8_t *d = byte_line;
3291         byte_line += stride;
3292         w = byte_width;
3293
3294         if (w >= 1 && ((unsigned long)d & 1))
3295         {
3296             *(uint8_t *)d = data;
3297             w -= 1;
3298             d += 1;
3299         }
3300
3301         while (w >= 2 && ((unsigned long)d & 3))
3302         {
3303             *(uint16_t *)d = data;
3304             w -= 2;
3305             d += 2;
3306         }
3307
3308         while (w >= 4 && ((unsigned long)d & 15))
3309         {
3310             *(uint32_t *)d = data;
3311
3312             w -= 4;
3313             d += 4;
3314         }
3315
3316         while (w >= 128)
3317         {
3318             save_128_aligned ((__m128i*)(d),     xmm_def);
3319             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3320             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3321             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3322             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3323             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3324             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3325             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3326
3327             d += 128;
3328             w -= 128;
3329         }
3330
3331         if (w >= 64)
3332         {
3333             save_128_aligned ((__m128i*)(d),     xmm_def);
3334             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3335             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3336             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3337
3338             d += 64;
3339             w -= 64;
3340         }
3341
3342         if (w >= 32)
3343         {
3344             save_128_aligned ((__m128i*)(d),     xmm_def);
3345             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3346
3347             d += 32;
3348             w -= 32;
3349         }
3350
3351         if (w >= 16)
3352         {
3353             save_128_aligned ((__m128i*)(d),     xmm_def);
3354
3355             d += 16;
3356             w -= 16;
3357         }
3358
3359         while (w >= 4)
3360         {
3361             *(uint32_t *)d = data;
3362
3363             w -= 4;
3364             d += 4;
3365         }
3366
3367         if (w >= 2)
3368         {
3369             *(uint16_t *)d = data;
3370             w -= 2;
3371             d += 2;
3372         }
3373
3374         if (w >= 1)
3375         {
3376             *(uint8_t *)d = data;
3377             w -= 1;
3378             d += 1;
3379         }
3380     }
3381
3382     return TRUE;
3383 }
3384
3385 static void
3386 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3387                              pixman_composite_info_t *info)
3388 {
3389     PIXMAN_COMPOSITE_ARGS (info);
3390     uint32_t src, srca;
3391     uint32_t    *dst_line, *dst;
3392     uint8_t     *mask_line, *mask;
3393     int dst_stride, mask_stride;
3394     int32_t w;
3395     uint32_t m;
3396
3397     __m128i xmm_src, xmm_def;
3398     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3399
3400     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3401
3402     srca = src >> 24;
3403     if (src == 0)
3404     {
3405         pixman_fill_sse2 (dest_image->bits.bits, dest_image->bits.rowstride,
3406                           PIXMAN_FORMAT_BPP (dest_image->bits.format),
3407                           dest_x, dest_y, width, height, 0);
3408         return;
3409     }
3410
3411     PIXMAN_IMAGE_GET_LINE (
3412         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3413     PIXMAN_IMAGE_GET_LINE (
3414         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3415
3416     xmm_def = create_mask_2x32_128 (src, src);
3417     xmm_src = expand_pixel_32_1x128 (src);
3418
3419     while (height--)
3420     {
3421         dst = dst_line;
3422         dst_line += dst_stride;
3423         mask = mask_line;
3424         mask_line += mask_stride;
3425         w = width;
3426
3427         while (w && (unsigned long)dst & 15)
3428         {
3429             uint8_t m = *mask++;
3430
3431             if (m)
3432             {
3433                 *dst = pack_1x128_32 (
3434                     pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3435             }
3436             else
3437             {
3438                 *dst = 0;
3439             }
3440
3441             w--;
3442             dst++;
3443         }
3444
3445         while (w >= 4)
3446         {
3447             m = *((uint32_t*)mask);
3448
3449             if (srca == 0xff && m == 0xffffffff)
3450             {
3451                 save_128_aligned ((__m128i*)dst, xmm_def);
3452             }
3453             else if (m)
3454             {
3455                 xmm_mask = unpack_32_1x128 (m);
3456                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3457
3458                 /* Unpacking */
3459                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3460
3461                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3462                                         &xmm_mask_lo, &xmm_mask_hi);
3463
3464                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3465                                     &xmm_mask_lo, &xmm_mask_hi,
3466                                     &xmm_mask_lo, &xmm_mask_hi);
3467
3468                 save_128_aligned (
3469                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3470             }
3471             else
3472             {
3473                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3474             }
3475
3476             w -= 4;
3477             dst += 4;
3478             mask += 4;
3479         }
3480
3481         while (w)
3482         {
3483             uint8_t m = *mask++;
3484
3485             if (m)
3486             {
3487                 *dst = pack_1x128_32 (
3488                     pix_multiply_1x128 (
3489                         xmm_src, expand_pixel_8_1x128 (m)));
3490             }
3491             else
3492             {
3493                 *dst = 0;
3494             }
3495
3496             w--;
3497             dst++;
3498         }
3499     }
3500
3501 }
3502
3503 static void
3504 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3505                               pixman_composite_info_t *info)
3506 {
3507     PIXMAN_COMPOSITE_ARGS (info);
3508     uint32_t src;
3509     uint16_t    *dst_line, *dst, d;
3510     uint8_t     *mask_line, *mask;
3511     int dst_stride, mask_stride;
3512     int32_t w;
3513     uint32_t m;
3514     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3515
3516     __m128i xmm_src, xmm_alpha;
3517     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3518     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3519
3520     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3521
3522     if (src == 0)
3523         return;
3524
3525     PIXMAN_IMAGE_GET_LINE (
3526         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3527     PIXMAN_IMAGE_GET_LINE (
3528         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3529
3530     xmm_src = expand_pixel_32_1x128 (src);
3531     xmm_alpha = expand_alpha_1x128 (xmm_src);
3532     mmx_src = xmm_src;
3533     mmx_alpha = xmm_alpha;
3534
3535     while (height--)
3536     {
3537         dst = dst_line;
3538         dst_line += dst_stride;
3539         mask = mask_line;
3540         mask_line += mask_stride;
3541         w = width;
3542
3543         while (w && (unsigned long)dst & 15)
3544         {
3545             m = *mask++;
3546
3547             if (m)
3548             {
3549                 d = *dst;
3550                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3551                 mmx_dest = expand565_16_1x128 (d);
3552
3553                 *dst = pack_565_32_16 (
3554                     pack_1x128_32 (
3555                         in_over_1x128 (
3556                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3557             }
3558
3559             w--;
3560             dst++;
3561         }
3562
3563         while (w >= 8)
3564         {
3565             xmm_dst = load_128_aligned ((__m128i*) dst);
3566             unpack_565_128_4x128 (xmm_dst,
3567                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3568
3569             m = *((uint32_t*)mask);
3570             mask += 4;
3571
3572             if (m)
3573             {
3574                 xmm_mask = unpack_32_1x128 (m);
3575                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3576
3577                 /* Unpacking */
3578                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3579
3580                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3581                                         &xmm_mask_lo, &xmm_mask_hi);
3582
3583                 in_over_2x128 (&xmm_src, &xmm_src,
3584                                &xmm_alpha, &xmm_alpha,
3585                                &xmm_mask_lo, &xmm_mask_hi,
3586                                &xmm_dst0, &xmm_dst1);
3587             }
3588
3589             m = *((uint32_t*)mask);
3590             mask += 4;
3591
3592             if (m)
3593             {
3594                 xmm_mask = unpack_32_1x128 (m);
3595                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3596
3597                 /* Unpacking */
3598                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3599
3600                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3601                                         &xmm_mask_lo, &xmm_mask_hi);
3602                 in_over_2x128 (&xmm_src, &xmm_src,
3603                                &xmm_alpha, &xmm_alpha,
3604                                &xmm_mask_lo, &xmm_mask_hi,
3605                                &xmm_dst2, &xmm_dst3);
3606             }
3607
3608             save_128_aligned (
3609                 (__m128i*)dst, pack_565_4x128_128 (
3610                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3611
3612             w -= 8;
3613             dst += 8;
3614         }
3615
3616         while (w)
3617         {
3618             m = *mask++;
3619
3620             if (m)
3621             {
3622                 d = *dst;
3623                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3624                 mmx_dest = expand565_16_1x128 (d);
3625
3626                 *dst = pack_565_32_16 (
3627                     pack_1x128_32 (
3628                         in_over_1x128 (
3629                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3630             }
3631
3632             w--;
3633             dst++;
3634         }
3635     }
3636
3637 }
3638
3639 static void
3640 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3641                                  pixman_composite_info_t *info)
3642 {
3643     PIXMAN_COMPOSITE_ARGS (info);
3644     uint16_t    *dst_line, *dst, d;
3645     uint32_t    *src_line, *src, s;
3646     int dst_stride, src_stride;
3647     int32_t w;
3648     uint32_t opaque, zero;
3649
3650     __m128i ms;
3651     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3652     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3653
3654     PIXMAN_IMAGE_GET_LINE (
3655         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3656     PIXMAN_IMAGE_GET_LINE (
3657         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3658
3659     while (height--)
3660     {
3661         dst = dst_line;
3662         dst_line += dst_stride;
3663         src = src_line;
3664         src_line += src_stride;
3665         w = width;
3666
3667         while (w && (unsigned long)dst & 15)
3668         {
3669             s = *src++;
3670             d = *dst;
3671
3672             ms = unpack_32_1x128 (s);
3673
3674             *dst++ = pack_565_32_16 (
3675                 pack_1x128_32 (
3676                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3677             w--;
3678         }
3679
3680         while (w >= 8)
3681         {
3682             /* First round */
3683             xmm_src = load_128_unaligned ((__m128i*)src);
3684             xmm_dst = load_128_aligned  ((__m128i*)dst);
3685
3686             opaque = is_opaque (xmm_src);
3687             zero = is_zero (xmm_src);
3688
3689             unpack_565_128_4x128 (xmm_dst,
3690                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3691             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3692
3693             /* preload next round*/
3694             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3695
3696             if (opaque)
3697             {
3698                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3699                                      &xmm_dst0, &xmm_dst1);
3700             }
3701             else if (!zero)
3702             {
3703                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3704                                         &xmm_dst0, &xmm_dst1);
3705             }
3706
3707             /* Second round */
3708             opaque = is_opaque (xmm_src);
3709             zero = is_zero (xmm_src);
3710
3711             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3712
3713             if (opaque)
3714             {
3715                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3716                                      &xmm_dst2, &xmm_dst3);
3717             }
3718             else if (!zero)
3719             {
3720                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3721                                         &xmm_dst2, &xmm_dst3);
3722             }
3723
3724             save_128_aligned (
3725                 (__m128i*)dst, pack_565_4x128_128 (
3726                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3727
3728             w -= 8;
3729             src += 8;
3730             dst += 8;
3731         }
3732
3733         while (w)
3734         {
3735             s = *src++;
3736             d = *dst;
3737
3738             ms = unpack_32_1x128 (s);
3739
3740             *dst++ = pack_565_32_16 (
3741                 pack_1x128_32 (
3742                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3743             w--;
3744         }
3745     }
3746
3747 }
3748
3749 static void
3750 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3751                                  pixman_composite_info_t *info)
3752 {
3753     PIXMAN_COMPOSITE_ARGS (info);
3754     uint32_t    *dst_line, *dst, d;
3755     uint32_t    *src_line, *src, s;
3756     int dst_stride, src_stride;
3757     int32_t w;
3758     uint32_t opaque, zero;
3759
3760     __m128i xmm_src_lo, xmm_src_hi;
3761     __m128i xmm_dst_lo, xmm_dst_hi;
3762
3763     PIXMAN_IMAGE_GET_LINE (
3764         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3765     PIXMAN_IMAGE_GET_LINE (
3766         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3767
3768     while (height--)
3769     {
3770         dst = dst_line;
3771         dst_line += dst_stride;
3772         src = src_line;
3773         src_line += src_stride;
3774         w = width;
3775
3776         while (w && (unsigned long)dst & 15)
3777         {
3778             s = *src++;
3779             d = *dst;
3780
3781             *dst++ = pack_1x128_32 (
3782                 over_rev_non_pre_1x128 (
3783                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
3784
3785             w--;
3786         }
3787
3788         while (w >= 4)
3789         {
3790             xmm_src_hi = load_128_unaligned ((__m128i*)src);
3791
3792             opaque = is_opaque (xmm_src_hi);
3793             zero = is_zero (xmm_src_hi);
3794
3795             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3796
3797             if (opaque)
3798             {
3799                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3800                                      &xmm_dst_lo, &xmm_dst_hi);
3801
3802                 save_128_aligned (
3803                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3804             }
3805             else if (!zero)
3806             {
3807                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
3808
3809                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3810
3811                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3812                                         &xmm_dst_lo, &xmm_dst_hi);
3813
3814                 save_128_aligned (
3815                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3816             }
3817
3818             w -= 4;
3819             dst += 4;
3820             src += 4;
3821         }
3822
3823         while (w)
3824         {
3825             s = *src++;
3826             d = *dst;
3827
3828             *dst++ = pack_1x128_32 (
3829                 over_rev_non_pre_1x128 (
3830                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
3831
3832             w--;
3833         }
3834     }
3835
3836 }
3837
3838 static void
3839 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3840                                     pixman_composite_info_t *info)
3841 {
3842     PIXMAN_COMPOSITE_ARGS (info);
3843     uint32_t src;
3844     uint16_t    *dst_line, *dst, d;
3845     uint32_t    *mask_line, *mask, m;
3846     int dst_stride, mask_stride;
3847     int w;
3848     uint32_t pack_cmp;
3849
3850     __m128i xmm_src, xmm_alpha;
3851     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3852     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3853
3854     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3855
3856     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3857
3858     if (src == 0)
3859         return;
3860
3861     PIXMAN_IMAGE_GET_LINE (
3862         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3863     PIXMAN_IMAGE_GET_LINE (
3864         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3865
3866     xmm_src = expand_pixel_32_1x128 (src);
3867     xmm_alpha = expand_alpha_1x128 (xmm_src);
3868     mmx_src = xmm_src;
3869     mmx_alpha = xmm_alpha;
3870
3871     while (height--)
3872     {
3873         w = width;
3874         mask = mask_line;
3875         dst = dst_line;
3876         mask_line += mask_stride;
3877         dst_line += dst_stride;
3878
3879         while (w && ((unsigned long)dst & 15))
3880         {
3881             m = *(uint32_t *) mask;
3882
3883             if (m)
3884             {
3885                 d = *dst;
3886                 mmx_mask = unpack_32_1x128 (m);
3887                 mmx_dest = expand565_16_1x128 (d);
3888
3889                 *dst = pack_565_32_16 (
3890                     pack_1x128_32 (
3891                         in_over_1x128 (
3892                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3893             }
3894
3895             w--;
3896             dst++;
3897             mask++;
3898         }
3899
3900         while (w >= 8)
3901         {
3902             /* First round */
3903             xmm_mask = load_128_unaligned ((__m128i*)mask);
3904             xmm_dst = load_128_aligned ((__m128i*)dst);
3905
3906             pack_cmp = _mm_movemask_epi8 (
3907                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3908
3909             unpack_565_128_4x128 (xmm_dst,
3910                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3911             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3912
3913             /* preload next round */
3914             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
3915
3916             /* preload next round */
3917             if (pack_cmp != 0xffff)
3918             {
3919                 in_over_2x128 (&xmm_src, &xmm_src,
3920                                &xmm_alpha, &xmm_alpha,
3921                                &xmm_mask_lo, &xmm_mask_hi,
3922                                &xmm_dst0, &xmm_dst1);
3923             }
3924
3925             /* Second round */
3926             pack_cmp = _mm_movemask_epi8 (
3927                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3928
3929             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3930
3931             if (pack_cmp != 0xffff)
3932             {
3933                 in_over_2x128 (&xmm_src, &xmm_src,
3934                                &xmm_alpha, &xmm_alpha,
3935                                &xmm_mask_lo, &xmm_mask_hi,
3936                                &xmm_dst2, &xmm_dst3);
3937             }
3938
3939             save_128_aligned (
3940                 (__m128i*)dst, pack_565_4x128_128 (
3941                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3942
3943             w -= 8;
3944             dst += 8;
3945             mask += 8;
3946         }
3947
3948         while (w)
3949         {
3950             m = *(uint32_t *) mask;
3951
3952             if (m)
3953             {
3954                 d = *dst;
3955                 mmx_mask = unpack_32_1x128 (m);
3956                 mmx_dest = expand565_16_1x128 (d);
3957
3958                 *dst = pack_565_32_16 (
3959                     pack_1x128_32 (
3960                         in_over_1x128 (
3961                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3962             }
3963
3964             w--;
3965             dst++;
3966             mask++;
3967         }
3968     }
3969
3970 }
3971
3972 static void
3973 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
3974                          pixman_composite_info_t *info)
3975 {
3976     PIXMAN_COMPOSITE_ARGS (info);
3977     uint8_t     *dst_line, *dst;
3978     uint8_t     *mask_line, *mask;
3979     int dst_stride, mask_stride;
3980     uint32_t d, m;
3981     uint32_t src;
3982     int32_t w;
3983
3984     __m128i xmm_alpha;
3985     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3986     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3987
3988     PIXMAN_IMAGE_GET_LINE (
3989         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3990     PIXMAN_IMAGE_GET_LINE (
3991         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3992
3993     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3994
3995     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
3996
3997     while (height--)
3998     {
3999         dst = dst_line;
4000         dst_line += dst_stride;
4001         mask = mask_line;
4002         mask_line += mask_stride;
4003         w = width;
4004
4005         while (w && ((unsigned long)dst & 15))
4006         {
4007             m = (uint32_t) *mask++;
4008             d = (uint32_t) *dst;
4009
4010             *dst++ = (uint8_t) pack_1x128_32 (
4011                 pix_multiply_1x128 (
4012                     pix_multiply_1x128 (xmm_alpha,
4013                                        unpack_32_1x128 (m)),
4014                     unpack_32_1x128 (d)));
4015             w--;
4016         }
4017
4018         while (w >= 16)
4019         {
4020             xmm_mask = load_128_unaligned ((__m128i*)mask);
4021             xmm_dst = load_128_aligned ((__m128i*)dst);
4022
4023             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4024             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4025
4026             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4027                                 &xmm_mask_lo, &xmm_mask_hi,
4028                                 &xmm_mask_lo, &xmm_mask_hi);
4029
4030             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4031                                 &xmm_dst_lo, &xmm_dst_hi,
4032                                 &xmm_dst_lo, &xmm_dst_hi);
4033
4034             save_128_aligned (
4035                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4036
4037             mask += 16;
4038             dst += 16;
4039             w -= 16;
4040         }
4041
4042         while (w)
4043         {
4044             m = (uint32_t) *mask++;
4045             d = (uint32_t) *dst;
4046
4047             *dst++ = (uint8_t) pack_1x128_32 (
4048                 pix_multiply_1x128 (
4049                     pix_multiply_1x128 (
4050                         xmm_alpha, unpack_32_1x128 (m)),
4051                     unpack_32_1x128 (d)));
4052             w--;
4053         }
4054     }
4055
4056 }
4057
4058 static void
4059 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4060                        pixman_composite_info_t *info)
4061 {
4062     PIXMAN_COMPOSITE_ARGS (info);
4063     uint8_t     *dst_line, *dst;
4064     int dst_stride;
4065     uint32_t d;
4066     uint32_t src;
4067     int32_t w;
4068
4069     __m128i xmm_alpha;
4070     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4071
4072     PIXMAN_IMAGE_GET_LINE (
4073         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4074
4075     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4076
4077     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4078
4079     src = src >> 24;
4080
4081     if (src == 0xff)
4082         return;
4083
4084     if (src == 0x00)
4085     {
4086         pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4087                      8, dest_x, dest_y, width, height, src);
4088
4089         return;
4090     }
4091
4092     while (height--)
4093     {
4094         dst = dst_line;
4095         dst_line += dst_stride;
4096         w = width;
4097
4098         while (w && ((unsigned long)dst & 15))
4099         {
4100             d = (uint32_t) *dst;
4101
4102             *dst++ = (uint8_t) pack_1x128_32 (
4103                 pix_multiply_1x128 (
4104                     xmm_alpha,
4105                     unpack_32_1x128 (d)));
4106             w--;
4107         }
4108
4109         while (w >= 16)
4110         {
4111             xmm_dst = load_128_aligned ((__m128i*)dst);
4112
4113             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4114             
4115             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4116                                 &xmm_dst_lo, &xmm_dst_hi,
4117                                 &xmm_dst_lo, &xmm_dst_hi);
4118
4119             save_128_aligned (
4120                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4121
4122             dst += 16;
4123             w -= 16;
4124         }
4125
4126         while (w)
4127         {
4128             d = (uint32_t) *dst;
4129
4130             *dst++ = (uint8_t) pack_1x128_32 (
4131                 pix_multiply_1x128 (
4132                     xmm_alpha,
4133                     unpack_32_1x128 (d)));
4134             w--;
4135         }
4136     }
4137
4138 }
4139
4140 static void
4141 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4142                        pixman_composite_info_t *info)
4143 {
4144     PIXMAN_COMPOSITE_ARGS (info);
4145     uint8_t     *dst_line, *dst;
4146     uint8_t     *src_line, *src;
4147     int src_stride, dst_stride;
4148     int32_t w;
4149     uint32_t s, d;
4150
4151     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4152     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4153
4154     PIXMAN_IMAGE_GET_LINE (
4155         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4156     PIXMAN_IMAGE_GET_LINE (
4157         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4158
4159     while (height--)
4160     {
4161         dst = dst_line;
4162         dst_line += dst_stride;
4163         src = src_line;
4164         src_line += src_stride;
4165         w = width;
4166
4167         while (w && ((unsigned long)dst & 15))
4168         {
4169             s = (uint32_t) *src++;
4170             d = (uint32_t) *dst;
4171
4172             *dst++ = (uint8_t) pack_1x128_32 (
4173                 pix_multiply_1x128 (
4174                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4175             w--;
4176         }
4177
4178         while (w >= 16)
4179         {
4180             xmm_src = load_128_unaligned ((__m128i*)src);
4181             xmm_dst = load_128_aligned ((__m128i*)dst);
4182
4183             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4184             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4185
4186             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4187                                 &xmm_dst_lo, &xmm_dst_hi,
4188                                 &xmm_dst_lo, &xmm_dst_hi);
4189
4190             save_128_aligned (
4191                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4192
4193             src += 16;
4194             dst += 16;
4195             w -= 16;
4196         }
4197
4198         while (w)
4199         {
4200             s = (uint32_t) *src++;
4201             d = (uint32_t) *dst;
4202
4203             *dst++ = (uint8_t) pack_1x128_32 (
4204                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4205             w--;
4206         }
4207     }
4208
4209 }
4210
4211 static void
4212 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4213                           pixman_composite_info_t *info)
4214 {
4215     PIXMAN_COMPOSITE_ARGS (info);
4216     uint8_t     *dst_line, *dst;
4217     uint8_t     *mask_line, *mask;
4218     int dst_stride, mask_stride;
4219     int32_t w;
4220     uint32_t src;
4221     uint32_t m, d;
4222
4223     __m128i xmm_alpha;
4224     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4225     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4226
4227     PIXMAN_IMAGE_GET_LINE (
4228         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4229     PIXMAN_IMAGE_GET_LINE (
4230         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4231
4232     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4233
4234     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4235
4236     while (height--)
4237     {
4238         dst = dst_line;
4239         dst_line += dst_stride;
4240         mask = mask_line;
4241         mask_line += mask_stride;
4242         w = width;
4243
4244         while (w && ((unsigned long)dst & 15))
4245         {
4246             m = (uint32_t) *mask++;
4247             d = (uint32_t) *dst;
4248
4249             *dst++ = (uint8_t) pack_1x128_32 (
4250                 _mm_adds_epu16 (
4251                     pix_multiply_1x128 (
4252                         xmm_alpha, unpack_32_1x128 (m)),
4253                     unpack_32_1x128 (d)));
4254             w--;
4255         }
4256
4257         while (w >= 16)
4258         {
4259             xmm_mask = load_128_unaligned ((__m128i*)mask);
4260             xmm_dst = load_128_aligned ((__m128i*)dst);
4261
4262             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4263             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4264
4265             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4266                                 &xmm_mask_lo, &xmm_mask_hi,
4267                                 &xmm_mask_lo, &xmm_mask_hi);
4268
4269             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4270             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4271
4272             save_128_aligned (
4273                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4274
4275             mask += 16;
4276             dst += 16;
4277             w -= 16;
4278         }
4279
4280         while (w)
4281         {
4282             m = (uint32_t) *mask++;
4283             d = (uint32_t) *dst;
4284
4285             *dst++ = (uint8_t) pack_1x128_32 (
4286                 _mm_adds_epu16 (
4287                     pix_multiply_1x128 (
4288                         xmm_alpha, unpack_32_1x128 (m)),
4289                     unpack_32_1x128 (d)));
4290
4291             w--;
4292         }
4293     }
4294
4295 }
4296
4297 static void
4298 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4299                         pixman_composite_info_t *info)
4300 {
4301     PIXMAN_COMPOSITE_ARGS (info);
4302     uint8_t     *dst_line, *dst;
4303     int dst_stride;
4304     int32_t w;
4305     uint32_t src;
4306
4307     __m128i xmm_src;
4308
4309     PIXMAN_IMAGE_GET_LINE (
4310         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4311
4312     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4313
4314     src >>= 24;
4315
4316     if (src == 0x00)
4317         return;
4318
4319     if (src == 0xff)
4320     {
4321         pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4322                      8, dest_x, dest_y, width, height, 0xff);
4323
4324         return;
4325     }
4326
4327     src = (src << 24) | (src << 16) | (src << 8) | src;
4328     xmm_src = _mm_set_epi32 (src, src, src, src);
4329
4330     while (height--)
4331     {
4332         dst = dst_line;
4333         dst_line += dst_stride;
4334         w = width;
4335
4336         while (w && ((unsigned long)dst & 15))
4337         {
4338             *dst = (uint8_t)_mm_cvtsi128_si32 (
4339                 _mm_adds_epu8 (
4340                     xmm_src,
4341                     _mm_cvtsi32_si128 (*dst)));
4342
4343             w--;
4344             dst++;
4345         }
4346
4347         while (w >= 16)
4348         {
4349             save_128_aligned (
4350                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4351
4352             dst += 16;
4353             w -= 16;
4354         }
4355
4356         while (w)
4357         {
4358             *dst = (uint8_t)_mm_cvtsi128_si32 (
4359                 _mm_adds_epu8 (
4360                     xmm_src,
4361                     _mm_cvtsi32_si128 (*dst)));
4362
4363             w--;
4364             dst++;
4365         }
4366     }
4367
4368 }
4369
4370 static void
4371 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4372                         pixman_composite_info_t *info)
4373 {
4374     PIXMAN_COMPOSITE_ARGS (info);
4375     uint8_t     *dst_line, *dst;
4376     uint8_t     *src_line, *src;
4377     int dst_stride, src_stride;
4378     int32_t w;
4379     uint16_t t;
4380
4381     PIXMAN_IMAGE_GET_LINE (
4382         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4383     PIXMAN_IMAGE_GET_LINE (
4384         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4385
4386     while (height--)
4387     {
4388         dst = dst_line;
4389         src = src_line;
4390
4391         dst_line += dst_stride;
4392         src_line += src_stride;
4393         w = width;
4394
4395         /* Small head */
4396         while (w && (unsigned long)dst & 3)
4397         {
4398             t = (*dst) + (*src++);
4399             *dst++ = t | (0 - (t >> 8));
4400             w--;
4401         }
4402
4403         sse2_combine_add_u (imp, op,
4404                             (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4405
4406         /* Small tail */
4407         dst += w & 0xfffc;
4408         src += w & 0xfffc;
4409
4410         w &= 3;
4411
4412         while (w)
4413         {
4414             t = (*dst) + (*src++);
4415             *dst++ = t | (0 - (t >> 8));
4416             w--;
4417         }
4418     }
4419
4420 }
4421
4422 static void
4423 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4424                               pixman_composite_info_t *info)
4425 {
4426     PIXMAN_COMPOSITE_ARGS (info);
4427     uint32_t    *dst_line, *dst;
4428     uint32_t    *src_line, *src;
4429     int dst_stride, src_stride;
4430
4431     PIXMAN_IMAGE_GET_LINE (
4432         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4433     PIXMAN_IMAGE_GET_LINE (
4434         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4435
4436     while (height--)
4437     {
4438         dst = dst_line;
4439         dst_line += dst_stride;
4440         src = src_line;
4441         src_line += src_stride;
4442
4443         sse2_combine_add_u (imp, op, dst, src, NULL, width);
4444     }
4445
4446 }
4447
4448 static pixman_bool_t
4449 pixman_blt_sse2 (uint32_t *src_bits,
4450                  uint32_t *dst_bits,
4451                  int       src_stride,
4452                  int       dst_stride,
4453                  int       src_bpp,
4454                  int       dst_bpp,
4455                  int       src_x,
4456                  int       src_y,
4457                  int       dest_x,
4458                  int       dest_y,
4459                  int       width,
4460                  int       height)
4461 {
4462     uint8_t *   src_bytes;
4463     uint8_t *   dst_bytes;
4464     int byte_width;
4465
4466     if (src_bpp != dst_bpp)
4467         return FALSE;
4468
4469     if (src_bpp == 16)
4470     {
4471         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4472         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4473         src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4474         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4475         byte_width = 2 * width;
4476         src_stride *= 2;
4477         dst_stride *= 2;
4478     }
4479     else if (src_bpp == 32)
4480     {
4481         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4482         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4483         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4484         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4485         byte_width = 4 * width;
4486         src_stride *= 4;
4487         dst_stride *= 4;
4488     }
4489     else
4490     {
4491         return FALSE;
4492     }
4493
4494     while (height--)
4495     {
4496         int w;
4497         uint8_t *s = src_bytes;
4498         uint8_t *d = dst_bytes;
4499         src_bytes += src_stride;
4500         dst_bytes += dst_stride;
4501         w = byte_width;
4502
4503         while (w >= 2 && ((unsigned long)d & 3))
4504         {
4505             *(uint16_t *)d = *(uint16_t *)s;
4506             w -= 2;
4507             s += 2;
4508             d += 2;
4509         }
4510
4511         while (w >= 4 && ((unsigned long)d & 15))
4512         {
4513             *(uint32_t *)d = *(uint32_t *)s;
4514
4515             w -= 4;
4516             s += 4;
4517             d += 4;
4518         }
4519
4520         while (w >= 64)
4521         {
4522             __m128i xmm0, xmm1, xmm2, xmm3;
4523
4524             xmm0 = load_128_unaligned ((__m128i*)(s));
4525             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4526             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4527             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4528
4529             save_128_aligned ((__m128i*)(d),    xmm0);
4530             save_128_aligned ((__m128i*)(d + 16), xmm1);
4531             save_128_aligned ((__m128i*)(d + 32), xmm2);
4532             save_128_aligned ((__m128i*)(d + 48), xmm3);
4533
4534             s += 64;
4535             d += 64;
4536             w -= 64;
4537         }
4538
4539         while (w >= 16)
4540         {
4541             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4542
4543             w -= 16;
4544             d += 16;
4545             s += 16;
4546         }
4547
4548         while (w >= 4)
4549         {
4550             *(uint32_t *)d = *(uint32_t *)s;
4551
4552             w -= 4;
4553             s += 4;
4554             d += 4;
4555         }
4556
4557         if (w >= 2)
4558         {
4559             *(uint16_t *)d = *(uint16_t *)s;
4560             w -= 2;
4561             s += 2;
4562             d += 2;
4563         }
4564     }
4565
4566
4567     return TRUE;
4568 }
4569
4570 static void
4571 sse2_composite_copy_area (pixman_implementation_t *imp,
4572                           pixman_composite_info_t *info)
4573 {
4574     PIXMAN_COMPOSITE_ARGS (info);
4575     pixman_blt_sse2 (src_image->bits.bits,
4576                      dest_image->bits.bits,
4577                      src_image->bits.rowstride,
4578                      dest_image->bits.rowstride,
4579                      PIXMAN_FORMAT_BPP (src_image->bits.format),
4580                      PIXMAN_FORMAT_BPP (dest_image->bits.format),
4581                      src_x, src_y, dest_x, dest_y, width, height);
4582 }
4583
4584 static void
4585 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4586                                  pixman_composite_info_t *info)
4587 {
4588     PIXMAN_COMPOSITE_ARGS (info);
4589     uint32_t    *src, *src_line, s;
4590     uint32_t    *dst, *dst_line, d;
4591     uint8_t         *mask, *mask_line;
4592     uint32_t m;
4593     int src_stride, mask_stride, dst_stride;
4594     int32_t w;
4595     __m128i ms;
4596
4597     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4598     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4599     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4600
4601     PIXMAN_IMAGE_GET_LINE (
4602         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4603     PIXMAN_IMAGE_GET_LINE (
4604         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4605     PIXMAN_IMAGE_GET_LINE (
4606         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4607
4608     while (height--)
4609     {
4610         src = src_line;
4611         src_line += src_stride;
4612         dst = dst_line;
4613         dst_line += dst_stride;
4614         mask = mask_line;
4615         mask_line += mask_stride;
4616
4617         w = width;
4618
4619         while (w && (unsigned long)dst & 15)
4620         {
4621             s = 0xff000000 | *src++;
4622             m = (uint32_t) *mask++;
4623             d = *dst;
4624             ms = unpack_32_1x128 (s);
4625
4626             if (m != 0xff)
4627             {
4628                 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4629                 __m128i md = unpack_32_1x128 (d);
4630
4631                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4632             }
4633
4634             *dst++ = pack_1x128_32 (ms);
4635             w--;
4636         }
4637
4638         while (w >= 4)
4639         {
4640             m = *(uint32_t*) mask;
4641             xmm_src = _mm_or_si128 (
4642                 load_128_unaligned ((__m128i*)src), mask_ff000000);
4643
4644             if (m == 0xffffffff)
4645             {
4646                 save_128_aligned ((__m128i*)dst, xmm_src);
4647             }
4648             else
4649             {
4650                 xmm_dst = load_128_aligned ((__m128i*)dst);
4651
4652                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4653
4654                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4655                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4656                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4657
4658                 expand_alpha_rev_2x128 (
4659                     xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4660
4661                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4662                                &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4663                                &xmm_dst_lo, &xmm_dst_hi);
4664
4665                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4666             }
4667
4668             src += 4;
4669             dst += 4;
4670             mask += 4;
4671             w -= 4;
4672         }
4673
4674         while (w)
4675         {
4676             m = (uint32_t) *mask++;
4677
4678             if (m)
4679             {
4680                 s = 0xff000000 | *src;
4681
4682                 if (m == 0xff)
4683                 {
4684                     *dst = s;
4685                 }
4686                 else
4687                 {
4688                     __m128i ma, md, ms;
4689
4690                     d = *dst;
4691
4692                     ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4693                     md = unpack_32_1x128 (d);
4694                     ms = unpack_32_1x128 (s);
4695
4696                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4697                 }
4698
4699             }
4700
4701             src++;
4702             dst++;
4703             w--;
4704         }
4705     }
4706
4707 }
4708
4709 static void
4710 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4711                                  pixman_composite_info_t *info)
4712 {
4713     PIXMAN_COMPOSITE_ARGS (info);
4714     uint32_t    *src, *src_line, s;
4715     uint32_t    *dst, *dst_line, d;
4716     uint8_t         *mask, *mask_line;
4717     uint32_t m;
4718     int src_stride, mask_stride, dst_stride;
4719     int32_t w;
4720
4721     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4722     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4723     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4724
4725     PIXMAN_IMAGE_GET_LINE (
4726         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4727     PIXMAN_IMAGE_GET_LINE (
4728         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4729     PIXMAN_IMAGE_GET_LINE (
4730         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4731
4732     while (height--)
4733     {
4734         src = src_line;
4735         src_line += src_stride;
4736         dst = dst_line;
4737         dst_line += dst_stride;
4738         mask = mask_line;
4739         mask_line += mask_stride;
4740
4741         w = width;
4742
4743         while (w && (unsigned long)dst & 15)
4744         {
4745             uint32_t sa;
4746
4747             s = *src++;
4748             m = (uint32_t) *mask++;
4749             d = *dst;
4750
4751             sa = s >> 24;
4752
4753             if (m)
4754             {
4755                 if (sa == 0xff && m == 0xff)
4756                 {
4757                     *dst = s;
4758                 }
4759                 else
4760                 {
4761                     __m128i ms, md, ma, msa;
4762
4763                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
4764                     ms = unpack_32_1x128 (s);
4765                     md = unpack_32_1x128 (d);
4766
4767                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
4768
4769                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
4770                 }
4771             }
4772
4773             dst++;
4774             w--;
4775         }
4776
4777         while (w >= 4)
4778         {
4779             m = *(uint32_t *) mask;
4780
4781             if (m)
4782             {
4783                 xmm_src = load_128_unaligned ((__m128i*)src);
4784
4785                 if (m == 0xffffffff && is_opaque (xmm_src))
4786                 {
4787                     save_128_aligned ((__m128i *)dst, xmm_src);
4788                 }
4789                 else
4790                 {
4791                     xmm_dst = load_128_aligned ((__m128i *)dst);
4792
4793                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4794
4795                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4796                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4797                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4798
4799                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
4800                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4801
4802                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
4803                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
4804
4805                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4806                 }
4807             }
4808
4809             src += 4;
4810             dst += 4;
4811             mask += 4;
4812             w -= 4;
4813         }
4814
4815         while (w)
4816         {
4817             uint32_t sa;
4818
4819             s = *src++;
4820             m = (uint32_t) *mask++;
4821             d = *dst;
4822
4823             sa = s >> 24;
4824
4825             if (m)
4826             {
4827                 if (sa == 0xff && m == 0xff)
4828                 {
4829                     *dst = s;
4830                 }
4831                 else
4832                 {
4833                     __m128i ms, md, ma, msa;
4834
4835                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
4836                     ms = unpack_32_1x128 (s);
4837                     md = unpack_32_1x128 (d);
4838
4839                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
4840
4841                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
4842                 }
4843             }
4844
4845             dst++;
4846             w--;
4847         }
4848     }
4849
4850 }
4851
4852 static void
4853 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
4854                                     pixman_composite_info_t *info)
4855 {
4856     PIXMAN_COMPOSITE_ARGS (info);
4857     uint32_t src;
4858     uint32_t    *dst_line, *dst;
4859     __m128i xmm_src;
4860     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4861     __m128i xmm_dsta_hi, xmm_dsta_lo;
4862     int dst_stride;
4863     int32_t w;
4864
4865     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4866
4867     if (src == 0)
4868         return;
4869
4870     PIXMAN_IMAGE_GET_LINE (
4871         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4872
4873     xmm_src = expand_pixel_32_1x128 (src);
4874
4875     while (height--)
4876     {
4877         dst = dst_line;
4878
4879         dst_line += dst_stride;
4880         w = width;
4881
4882         while (w && (unsigned long)dst & 15)
4883         {
4884             __m128i vd;
4885
4886             vd = unpack_32_1x128 (*dst);
4887
4888             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
4889                                               xmm_src));
4890             w--;
4891             dst++;
4892         }
4893
4894         while (w >= 4)
4895         {
4896             __m128i tmp_lo, tmp_hi;
4897
4898             xmm_dst = load_128_aligned ((__m128i*)dst);
4899
4900             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4901             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
4902
4903             tmp_lo = xmm_src;
4904             tmp_hi = xmm_src;
4905
4906             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
4907                         &xmm_dsta_lo, &xmm_dsta_hi,
4908                         &tmp_lo, &tmp_hi);
4909
4910             save_128_aligned (
4911                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
4912
4913             w -= 4;
4914             dst += 4;
4915         }
4916
4917         while (w)
4918         {
4919             __m128i vd;
4920
4921             vd = unpack_32_1x128 (*dst);
4922
4923             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
4924                                               xmm_src));
4925             w--;
4926             dst++;
4927         }
4928
4929     }
4930
4931 }
4932
4933 static void
4934 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
4935                                     pixman_composite_info_t *info)
4936 {
4937     PIXMAN_COMPOSITE_ARGS (info);
4938     uint32_t    *src, *src_line, s;
4939     uint32_t    *dst, *dst_line, d;
4940     uint32_t    *mask, *mask_line;
4941     uint32_t    m;
4942     int src_stride, mask_stride, dst_stride;
4943     int32_t w;
4944
4945     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4946     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4947     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4948
4949     PIXMAN_IMAGE_GET_LINE (
4950         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4951     PIXMAN_IMAGE_GET_LINE (
4952         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4953     PIXMAN_IMAGE_GET_LINE (
4954         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4955
4956     while (height--)
4957     {
4958         src = src_line;
4959         src_line += src_stride;
4960         dst = dst_line;
4961         dst_line += dst_stride;
4962         mask = mask_line;
4963         mask_line += mask_stride;
4964
4965         w = width;
4966
4967         while (w && (unsigned long)dst & 15)
4968         {
4969             uint32_t sa;
4970
4971             s = *src++;
4972             m = (*mask++) >> 24;
4973             d = *dst;
4974
4975             sa = s >> 24;
4976
4977             if (m)
4978             {
4979                 if (sa == 0xff && m == 0xff)
4980                 {
4981                     *dst = s;
4982                 }
4983                 else
4984                 {
4985                     __m128i ms, md, ma, msa;
4986
4987                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
4988                     ms = unpack_32_1x128 (s);
4989                     md = unpack_32_1x128 (d);
4990
4991                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
4992
4993                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
4994                 }
4995             }
4996
4997             dst++;
4998             w--;
4999         }
5000
5001         while (w >= 4)
5002         {
5003             xmm_mask = load_128_unaligned ((__m128i*)mask);
5004
5005             if (!is_transparent (xmm_mask))
5006             {
5007                 xmm_src = load_128_unaligned ((__m128i*)src);
5008
5009                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5010                 {
5011                     save_128_aligned ((__m128i *)dst, xmm_src);
5012                 }
5013                 else
5014                 {
5015                     xmm_dst = load_128_aligned ((__m128i *)dst);
5016
5017                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5018                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5019                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5020
5021                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5022                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5023
5024                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5025                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5026
5027                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5028                 }
5029             }
5030
5031             src += 4;
5032             dst += 4;
5033             mask += 4;
5034             w -= 4;
5035         }
5036
5037         while (w)
5038         {
5039             uint32_t sa;
5040
5041             s = *src++;
5042             m = (*mask++) >> 24;
5043             d = *dst;
5044
5045             sa = s >> 24;
5046
5047             if (m)
5048             {
5049                 if (sa == 0xff && m == 0xff)
5050                 {
5051                     *dst = s;
5052                 }
5053                 else
5054                 {
5055                     __m128i ms, md, ma, msa;
5056
5057                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5058                     ms = unpack_32_1x128 (s);
5059                     md = unpack_32_1x128 (d);
5060
5061                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5062
5063                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5064                 }
5065             }
5066
5067             dst++;
5068             w--;
5069         }
5070     }
5071
5072 }
5073
5074 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5075 static force_inline void
5076 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5077                                              const uint32_t* ps,
5078                                              int32_t         w,
5079                                              pixman_fixed_t  vx,
5080                                              pixman_fixed_t  unit_x,
5081                                              pixman_fixed_t  max_vx,
5082                                              pixman_bool_t   fully_transparent_src)
5083 {
5084     uint32_t s, d;
5085     const uint32_t* pm = NULL;
5086
5087     __m128i xmm_dst_lo, xmm_dst_hi;
5088     __m128i xmm_src_lo, xmm_src_hi;
5089     __m128i xmm_alpha_lo, xmm_alpha_hi;
5090
5091     if (fully_transparent_src)
5092         return;
5093
5094     /* Align dst on a 16-byte boundary */
5095     while (w && ((unsigned long)pd & 15))
5096     {
5097         d = *pd;
5098         s = combine1 (ps + (vx >> 16), pm);
5099         vx += unit_x;
5100
5101         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5102         if (pm)
5103             pm++;
5104         w--;
5105     }
5106
5107     while (w >= 4)
5108     {
5109         __m128i tmp;
5110         uint32_t tmp1, tmp2, tmp3, tmp4;
5111
5112         tmp1 = ps[vx >> 16];
5113         vx += unit_x;
5114         tmp2 = ps[vx >> 16];
5115         vx += unit_x;
5116         tmp3 = ps[vx >> 16];
5117         vx += unit_x;
5118         tmp4 = ps[vx >> 16];
5119         vx += unit_x;
5120
5121         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5122
5123         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5124
5125         if (is_opaque (xmm_src_hi))
5126         {
5127             save_128_aligned ((__m128i*)pd, xmm_src_hi);
5128         }
5129         else if (!is_zero (xmm_src_hi))
5130         {
5131             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5132
5133             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5134             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5135
5136             expand_alpha_2x128 (
5137                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5138
5139             over_2x128 (&xmm_src_lo, &xmm_src_hi,
5140                         &xmm_alpha_lo, &xmm_alpha_hi,
5141                         &xmm_dst_lo, &xmm_dst_hi);
5142
5143             /* rebuid the 4 pixel data and save*/
5144             save_128_aligned ((__m128i*)pd,
5145                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5146         }
5147
5148         w -= 4;
5149         pd += 4;
5150         if (pm)
5151             pm += 4;
5152     }
5153
5154     while (w)
5155     {
5156         d = *pd;
5157         s = combine1 (ps + (vx >> 16), pm);
5158         vx += unit_x;
5159
5160         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5161         if (pm)
5162             pm++;
5163
5164         w--;
5165     }
5166 }
5167
5168 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5169                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5170                        uint32_t, uint32_t, COVER)
5171 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5172                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5173                        uint32_t, uint32_t, NONE)
5174 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5175                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5176                        uint32_t, uint32_t, PAD)
5177
5178 static force_inline void
5179 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5180                                                uint32_t *       dst,
5181                                                const uint32_t * src,
5182                                                int32_t          w,
5183                                                pixman_fixed_t   vx,
5184                                                pixman_fixed_t   unit_x,
5185                                                pixman_fixed_t   max_vx,
5186                                                pixman_bool_t    zero_src)
5187 {
5188     __m128i xmm_mask;
5189     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5190     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5191     __m128i xmm_alpha_lo, xmm_alpha_hi;
5192
5193     if (zero_src || (*mask >> 24) == 0)
5194         return;
5195
5196     xmm_mask = create_mask_16_128 (*mask >> 24);
5197
5198     while (w && (unsigned long)dst & 15)
5199     {
5200         uint32_t s = src[pixman_fixed_to_int (vx)];
5201         vx += unit_x;
5202
5203         if (s)
5204         {
5205             uint32_t d = *dst;
5206
5207             __m128i ms = unpack_32_1x128 (s);
5208             __m128i alpha     = expand_alpha_1x128 (ms);
5209             __m128i dest      = xmm_mask;
5210             __m128i alpha_dst = unpack_32_1x128 (d);
5211
5212             *dst = pack_1x128_32 (
5213                 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5214         }
5215         dst++;
5216         w--;
5217     }
5218
5219     while (w >= 4)
5220     {
5221         uint32_t tmp1, tmp2, tmp3, tmp4;
5222
5223         tmp1 = src[pixman_fixed_to_int (vx)];
5224         vx += unit_x;
5225         tmp2 = src[pixman_fixed_to_int (vx)];
5226         vx += unit_x;
5227         tmp3 = src[pixman_fixed_to_int (vx)];
5228         vx += unit_x;
5229         tmp4 = src[pixman_fixed_to_int (vx)];
5230         vx += unit_x;
5231
5232         xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5233
5234         if (!is_zero (xmm_src))
5235         {
5236             xmm_dst = load_128_aligned ((__m128i*)dst);
5237
5238             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5239             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5240             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5241                                 &xmm_alpha_lo, &xmm_alpha_hi);
5242
5243             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5244                            &xmm_alpha_lo, &xmm_alpha_hi,
5245                            &xmm_mask, &xmm_mask,
5246                            &xmm_dst_lo, &xmm_dst_hi);
5247
5248             save_128_aligned (
5249                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5250         }
5251
5252         dst += 4;
5253         w -= 4;
5254     }
5255
5256     while (w)
5257     {
5258         uint32_t s = src[pixman_fixed_to_int (vx)];
5259         vx += unit_x;
5260
5261         if (s)
5262         {
5263             uint32_t d = *dst;
5264
5265             __m128i ms = unpack_32_1x128 (s);
5266             __m128i alpha = expand_alpha_1x128 (ms);
5267             __m128i mask  = xmm_mask;
5268             __m128i dest  = unpack_32_1x128 (d);
5269
5270             *dst = pack_1x128_32 (
5271                 in_over_1x128 (&ms, &alpha, &mask, &dest));
5272         }
5273
5274         dst++;
5275         w--;
5276     }
5277
5278 }
5279
5280 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5281                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5282                               uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5283 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5284                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5285                               uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5286 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5287                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5288                               uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5289
5290 #define BILINEAR_DECLARE_VARIABLES                                              \
5291     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);      \
5292     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);      \
5293     const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
5294     const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);            \
5295     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,       \
5296                                           unit_x, unit_x, unit_x, unit_x);      \
5297     const __m128i xmm_zero = _mm_setzero_si128 ();                              \
5298     __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
5299
5300 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                                     \
5301 do {                                                                            \
5302     __m128i xmm_wh, xmm_lo, xmm_hi, a;                                          \
5303     /* fetch 2x2 pixel block into sse2 register */                              \
5304     uint32_t tl = src_top [pixman_fixed_to_int (vx)];                           \
5305     uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];                       \
5306     uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];                        \
5307     uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];                    \
5308     a = _mm_set_epi32 (tr, tl, br, bl);                                         \
5309     vx += unit_x;                                                               \
5310     /* vertical interpolation */                                                \
5311     a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),        \
5312                                         xmm_wt),                                \
5313                        _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),        \
5314                                         xmm_wb));                               \
5315     /* calculate horizontal weights */                                          \
5316     xmm_wh = _mm_add_epi16 (xmm_addc,                                           \
5317                             _mm_xor_si128 (xmm_xorc,                            \
5318                                            _mm_srli_epi16 (xmm_x, 8)));         \
5319     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                      \
5320     /* horizontal interpolation */                                              \
5321     xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                       \
5322     xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);                                       \
5323     a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),                     \
5324                        _mm_unpackhi_epi16 (xmm_lo, xmm_hi));                    \
5325     /* shift and pack the result */                                             \
5326     a = _mm_srli_epi32 (a, 16);                                                 \
5327     a = _mm_packs_epi32 (a, a);                                                 \
5328     a = _mm_packus_epi16 (a, a);                                                \
5329     pix = _mm_cvtsi128_si32 (a);                                                \
5330 } while (0)
5331
5332 #define BILINEAR_SKIP_ONE_PIXEL()                                               \
5333 do {                                                                            \
5334     vx += unit_x;                                                               \
5335     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                      \
5336 } while(0)
5337
5338 static force_inline void
5339 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
5340                                              const uint32_t * mask,
5341                                              const uint32_t * src_top,
5342                                              const uint32_t * src_bottom,
5343                                              int32_t          w,
5344                                              int              wt,
5345                                              int              wb,
5346                                              pixman_fixed_t   vx,
5347                                              pixman_fixed_t   unit_x,
5348                                              pixman_fixed_t   max_vx,
5349                                              pixman_bool_t    zero_src)
5350 {
5351     BILINEAR_DECLARE_VARIABLES;
5352     uint32_t pix1, pix2, pix3, pix4;
5353
5354     while ((w -= 4) >= 0)
5355     {
5356         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5357         BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5358         BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5359         BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5360         *dst++ = pix1;
5361         *dst++ = pix2;
5362         *dst++ = pix3;
5363         *dst++ = pix4;
5364     }
5365
5366     if (w & 2)
5367     {
5368         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5369         BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5370         *dst++ = pix1;
5371         *dst++ = pix2;
5372     }
5373
5374     if (w & 1)
5375     {
5376         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5377         *dst = pix1;
5378     }
5379
5380 }
5381
5382 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5383                                scaled_bilinear_scanline_sse2_8888_8888_SRC,
5384                                uint32_t, uint32_t, uint32_t,
5385                                COVER, FLAG_NONE)
5386 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5387                                scaled_bilinear_scanline_sse2_8888_8888_SRC,
5388                                uint32_t, uint32_t, uint32_t,
5389                                PAD, FLAG_NONE)
5390 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5391                                scaled_bilinear_scanline_sse2_8888_8888_SRC,
5392                                uint32_t, uint32_t, uint32_t,
5393                                NONE, FLAG_NONE)
5394 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5395                                scaled_bilinear_scanline_sse2_8888_8888_SRC,
5396                                uint32_t, uint32_t, uint32_t,
5397                                NORMAL, FLAG_NONE)
5398
5399 static force_inline void
5400 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
5401                                               const uint32_t * mask,
5402                                               const uint32_t * src_top,
5403                                               const uint32_t * src_bottom,
5404                                               int32_t          w,
5405                                               int              wt,
5406                                               int              wb,
5407                                               pixman_fixed_t   vx,
5408                                               pixman_fixed_t   unit_x,
5409                                               pixman_fixed_t   max_vx,
5410                                               pixman_bool_t    zero_src)
5411 {
5412     BILINEAR_DECLARE_VARIABLES;
5413     uint32_t pix1, pix2, pix3, pix4;
5414
5415     while (w && ((unsigned long)dst & 15))
5416     {
5417         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5418
5419         if (pix1)
5420         {
5421             pix2 = *dst;
5422             *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5423         }
5424
5425         w--;
5426         dst++;
5427     }
5428
5429     while (w  >= 4)
5430     {
5431         __m128i xmm_src;
5432         __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5433         __m128i xmm_alpha_hi, xmm_alpha_lo;
5434
5435         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5436         BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5437         BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5438         BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5439
5440         xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5441
5442         if (!is_zero (xmm_src))
5443         {
5444             if (is_opaque (xmm_src))
5445             {
5446                 save_128_aligned ((__m128i *)dst, xmm_src);
5447             }
5448             else
5449             {
5450                 __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5451
5452                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5453                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5454
5455                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5456                 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5457                             &xmm_dst_lo, &xmm_dst_hi);
5458
5459                 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5460             }
5461         }
5462
5463         w -= 4;
5464         dst += 4;
5465     }
5466
5467     while (w)
5468     {
5469         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5470
5471         if (pix1)
5472         {
5473             pix2 = *dst;
5474             *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5475         }
5476
5477         w--;
5478         dst++;
5479     }
5480 }
5481
5482 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5483                                scaled_bilinear_scanline_sse2_8888_8888_OVER,
5484                                uint32_t, uint32_t, uint32_t,
5485                                COVER, FLAG_NONE)
5486 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5487                                scaled_bilinear_scanline_sse2_8888_8888_OVER,
5488                                uint32_t, uint32_t, uint32_t,
5489                                PAD, FLAG_NONE)
5490 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5491                                scaled_bilinear_scanline_sse2_8888_8888_OVER,
5492                                uint32_t, uint32_t, uint32_t,
5493                                NONE, FLAG_NONE)
5494 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5495                                scaled_bilinear_scanline_sse2_8888_8888_OVER,
5496                                uint32_t, uint32_t, uint32_t,
5497                                NORMAL, FLAG_NONE)
5498
5499 static force_inline void
5500 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
5501                                                 const uint8_t  * mask,
5502                                                 const uint32_t * src_top,
5503                                                 const uint32_t * src_bottom,
5504                                                 int32_t          w,
5505                                                 int              wt,
5506                                                 int              wb,
5507                                                 pixman_fixed_t   vx,
5508                                                 pixman_fixed_t   unit_x,
5509                                                 pixman_fixed_t   max_vx,
5510                                                 pixman_bool_t    zero_src)
5511 {
5512     BILINEAR_DECLARE_VARIABLES;
5513     uint32_t pix1, pix2, pix3, pix4;
5514     uint32_t m;
5515
5516     while (w && ((unsigned long)dst & 15))
5517     {
5518         uint32_t sa;
5519
5520         m = (uint32_t) *mask++;
5521
5522         if (m)
5523         {
5524             BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5525             sa = pix1 >> 24;
5526
5527             if (sa == 0xff && m == 0xff)
5528             {
5529                 *dst = pix1;
5530             }
5531             else
5532             {
5533                 __m128i ms, md, ma, msa;
5534
5535                 pix2 = *dst;
5536                 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5537                 ms = unpack_32_1x128 (pix1);
5538                 md = unpack_32_1x128 (pix2);
5539
5540                 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5541
5542                 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5543             }
5544         }
5545         else
5546         {
5547             BILINEAR_SKIP_ONE_PIXEL ();
5548         }
5549
5550         w--;
5551         dst++;
5552     }
5553
5554     while (w >= 4)
5555     {
5556         __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5557         __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5558         __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5559
5560         m = *(uint32_t*)mask;
5561
5562         if (m)
5563         {
5564             BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5565             BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5566             BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5567             BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5568
5569             xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5570
5571             if (m == 0xffffffff && is_opaque (xmm_src))
5572             {
5573                 save_128_aligned ((__m128i *)dst, xmm_src);
5574             }
5575             else
5576             {
5577                 xmm_dst = load_128_aligned ((__m128i *)dst);
5578
5579                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5580
5581                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5582                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5583                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5584
5585                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5586                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5587
5588                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5589                                &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5590
5591                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5592             }
5593         }
5594         else
5595         {
5596             BILINEAR_SKIP_ONE_PIXEL ();
5597             BILINEAR_SKIP_ONE_PIXEL ();
5598             BILINEAR_SKIP_ONE_PIXEL ();
5599             BILINEAR_SKIP_ONE_PIXEL ();
5600         }
5601
5602         w -= 4;
5603         dst += 4;
5604         mask += 4;
5605     }
5606
5607     while (w)
5608     {
5609         uint32_t sa;
5610
5611         m = (uint32_t) *mask++;
5612
5613         if (m)
5614         {
5615             BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5616             sa = pix1 >> 24;
5617
5618             if (sa == 0xff && m == 0xff)
5619             {
5620                 *dst = pix1;
5621             }
5622             else
5623             {
5624                 __m128i ms, md, ma, msa;
5625
5626                 pix2 = *dst;
5627                 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5628                 ms = unpack_32_1x128 (pix1);
5629                 md = unpack_32_1x128 (pix2);
5630
5631                 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5632
5633                 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5634             }
5635         }
5636         else
5637         {
5638             BILINEAR_SKIP_ONE_PIXEL ();
5639         }
5640
5641         w--;
5642         dst++;
5643     }
5644 }
5645
5646 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
5647                                scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5648                                uint32_t, uint8_t, uint32_t,
5649                                COVER, FLAG_HAVE_NON_SOLID_MASK)
5650 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
5651                                scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5652                                uint32_t, uint8_t, uint32_t,
5653                                PAD, FLAG_HAVE_NON_SOLID_MASK)
5654 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
5655                                scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5656                                uint32_t, uint8_t, uint32_t,
5657                                NONE, FLAG_HAVE_NON_SOLID_MASK)
5658 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
5659                                scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5660                                uint32_t, uint8_t, uint32_t,
5661                                NORMAL, FLAG_HAVE_NON_SOLID_MASK)
5662
5663 static const pixman_fast_path_t sse2_fast_paths[] =
5664 {
5665     /* PIXMAN_OP_OVER */
5666     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5667     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5668     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5669     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5670     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5671     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5672     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5673     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5674     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5675     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5676     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5677     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5678     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5679     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5680     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5681     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5682     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5683     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5684     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5685     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5686     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5687     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5688     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5689     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5690     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5691     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5692     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5693     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5694     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5695     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5696     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5697     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5698     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5699     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5700     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5701     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5702     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5703     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5704     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5705     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5706     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5707     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5708     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5709     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5710     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5711     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5712     
5713     /* PIXMAN_OP_OVER_REVERSE */
5714     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5715     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5716
5717     /* PIXMAN_OP_ADD */
5718     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5719     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5720     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5721     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5722     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5723     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5724
5725     /* PIXMAN_OP_SRC */
5726     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5727     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5728     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5729     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5730     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5731     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5732     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5733     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5734     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5735     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5736     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5737     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5738     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5739     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5740
5741     /* PIXMAN_OP_IN */
5742     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5743     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5744     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5745
5746     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5747     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5748     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5749     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5750     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5751     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5752     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5753     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5754     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5755     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5756     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5757     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5758
5759     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
5760     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
5761     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
5762     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
5763
5764     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5765     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5766     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
5767
5768     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5769     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5770     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5771     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5772
5773     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
5774     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
5775     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
5776     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
5777
5778     { PIXMAN_OP_NONE },
5779 };
5780
5781 static pixman_bool_t
5782 sse2_blt (pixman_implementation_t *imp,
5783           uint32_t *               src_bits,
5784           uint32_t *               dst_bits,
5785           int                      src_stride,
5786           int                      dst_stride,
5787           int                      src_bpp,
5788           int                      dst_bpp,
5789           int                      src_x,
5790           int                      src_y,
5791           int                      dest_x,
5792           int                      dest_y,
5793           int                      width,
5794           int                      height)
5795 {
5796     if (!pixman_blt_sse2 (
5797             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5798             src_x, src_y, dest_x, dest_y, width, height))
5799
5800     {
5801         return _pixman_implementation_blt (
5802             imp->delegate,
5803             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5804             src_x, src_y, dest_x, dest_y, width, height);
5805     }
5806
5807     return TRUE;
5808 }
5809
5810 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5811 __attribute__((__force_align_arg_pointer__))
5812 #endif
5813 static pixman_bool_t
5814 sse2_fill (pixman_implementation_t *imp,
5815            uint32_t *               bits,
5816            int                      stride,
5817            int                      bpp,
5818            int                      x,
5819            int                      y,
5820            int                      width,
5821            int                      height,
5822            uint32_t xor)
5823 {
5824     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5825     {
5826         return _pixman_implementation_fill (
5827             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5828     }
5829
5830     return TRUE;
5831 }
5832
5833 static uint32_t *
5834 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
5835 {
5836     int w = iter->width;
5837     __m128i ff000000 = mask_ff000000;
5838     uint32_t *dst = iter->buffer;
5839     uint32_t *src = (uint32_t *)iter->bits;
5840
5841     iter->bits += iter->stride;
5842
5843     while (w && ((unsigned long)dst) & 0x0f)
5844     {
5845         *dst++ = (*src++) | 0xff000000;
5846         w--;
5847     }
5848
5849     while (w >= 4)
5850     {
5851         save_128_aligned (
5852             (__m128i *)dst, _mm_or_si128 (
5853                 load_128_unaligned ((__m128i *)src), ff000000));
5854
5855         dst += 4;
5856         src += 4;
5857         w -= 4;
5858     }
5859
5860     while (w)
5861     {
5862         *dst++ = (*src++) | 0xff000000;
5863         w--;
5864     }
5865
5866     return iter->buffer;
5867 }
5868
5869 static uint32_t *
5870 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
5871 {
5872     int w = iter->width;
5873     uint32_t *dst = iter->buffer;
5874     uint16_t *src = (uint16_t *)iter->bits;
5875     __m128i ff000000 = mask_ff000000;
5876
5877     iter->bits += iter->stride;
5878
5879     while (w && ((unsigned long)dst) & 0x0f)
5880     {
5881         uint16_t s = *src++;
5882
5883         *dst++ = CONVERT_0565_TO_8888 (s);
5884         w--;
5885     }
5886
5887     while (w >= 8)
5888     {
5889         __m128i lo, hi, s;
5890
5891         s = _mm_loadu_si128 ((__m128i *)src);
5892
5893         lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
5894         hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
5895
5896         save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
5897         save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
5898
5899         dst += 8;
5900         src += 8;
5901         w -= 8;
5902     }
5903
5904     while (w)
5905     {
5906         uint16_t s = *src++;
5907
5908         *dst++ = CONVERT_0565_TO_8888 (s);
5909         w--;
5910     }
5911
5912     return iter->buffer;
5913 }
5914
5915 static uint32_t *
5916 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
5917 {
5918     int w = iter->width;
5919     uint32_t *dst = iter->buffer;
5920     uint8_t *src = iter->bits;
5921     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5922
5923     iter->bits += iter->stride;
5924
5925     while (w && (((unsigned long)dst) & 15))
5926     {
5927         *dst++ = *(src++) << 24;
5928         w--;
5929     }
5930
5931     while (w >= 16)
5932     {
5933         xmm0 = _mm_loadu_si128((__m128i *)src);
5934
5935         xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
5936         xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
5937         xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
5938         xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
5939         xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
5940         xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
5941
5942         _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
5943         _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
5944         _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
5945         _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
5946
5947         dst += 16;
5948         src += 16;
5949         w -= 16;
5950     }
5951
5952     while (w)
5953     {
5954         *dst++ = *(src++) << 24;
5955         w--;
5956     }
5957
5958     return iter->buffer;
5959 }
5960
5961 typedef struct
5962 {
5963     pixman_format_code_t        format;
5964     pixman_iter_get_scanline_t  get_scanline;
5965 } fetcher_info_t;
5966
5967 static const fetcher_info_t fetchers[] =
5968 {
5969     { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
5970     { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
5971     { PIXMAN_a8,                sse2_fetch_a8 },
5972     { PIXMAN_null }
5973 };
5974
5975 static void
5976 sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
5977 {
5978     pixman_image_t *image = iter->image;
5979
5980 #define FLAGS                                                           \
5981     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |                \
5982      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
5983
5984     if ((iter->iter_flags & ITER_NARROW)                        &&
5985         (iter->image_flags & FLAGS) == FLAGS)
5986     {
5987         const fetcher_info_t *f;
5988
5989         for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
5990         {
5991             if (image->common.extended_format_code == f->format)
5992             {
5993                 uint8_t *b = (uint8_t *)image->bits.bits;
5994                 int s = image->bits.rowstride * 4;
5995
5996                 iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
5997                 iter->stride = s;
5998
5999                 iter->get_scanline = f->get_scanline;
6000                 return;
6001             }
6002         }
6003     }
6004
6005     imp->delegate->src_iter_init (imp->delegate, iter);
6006 }
6007
6008 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6009 __attribute__((__force_align_arg_pointer__))
6010 #endif
6011 pixman_implementation_t *
6012 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6013 {
6014     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6015
6016     /* SSE2 constants */
6017     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6018     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6019     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6020     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6021     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6022     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6023     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6024     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6025     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
6026     mask_0080 = create_mask_16_128 (0x0080);
6027     mask_00ff = create_mask_16_128 (0x00ff);
6028     mask_0101 = create_mask_16_128 (0x0101);
6029     mask_ffff = create_mask_16_128 (0xffff);
6030     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6031     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6032
6033     /* Set up function pointers */
6034     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6035     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6036     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6037     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6038     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6039     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6040     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6041     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6042     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6043     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6044
6045     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6046
6047     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6048     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6049     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6050     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6051     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6052     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6053     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6054     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6055     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6056     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6057     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6058
6059     imp->blt = sse2_blt;
6060     imp->fill = sse2_fill;
6061
6062     imp->src_iter_init = sse2_src_iter_init;
6063
6064     return imp;
6065 }