sse2: Delete obsolete or redundant comments
pixman/pixman-sse2.c
/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
#   include "pixman-x64-mmx-emulation.h"
#endif

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
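
/* unpack_565_to_8888() above widens each 5/6-bit field into the high
 * bits of an 8-bit channel, then replicates the field's top bits into
 * the vacated low bits (the mask_565_fix_* masks, initialized
 * elsewhere in this file, select those top bits).  That way 0x1f
 * expands to 0xff rather than 0xf8, so a full-intensity 565 channel
 * stays full intensity in 8888.
 */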

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}
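
/* pack_565_32_16() keeps only the top 5/6/5 bits of r/g/b.  A worked
 * example: 0xffff8040 (r = 0xff, g = 0x80, b = 0x40) packs to
 * 0xf800 | 0x0400 | 0x0008 == 0xfc08.
 */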

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
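
/* In the three predicates above, _mm_movemask_epi8() yields one bit
 * per byte of the comparison result.  For four packed ARGB32 pixels
 * the alpha bytes sit at byte offsets 3, 7, 11 and 15, which is
 * exactly the bit pattern 0x8888: is_opaque() checks that all four
 * alphas are 0xff, is_transparent() that all four are 0x00, and
 * is_zero() that the whole vector is zero.
 */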

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
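
/* pix_multiply_2x128() computes the correctly rounded x*a/255 in each
 * 16-bit lane without a division: ((x*a + 0x80) * 0x101) >> 16 equals
 * (t + (t >> 8)) >> 8 for t = x*a + 0x80, the standard exact
 * divide-by-255 identity.  E.g. x = a = 0xff gives
 * ((0xfe01 + 0x80) * 0x101) >> 16 == 0xff, so multiplying by full
 * alpha is lossless.
 */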

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
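
/* over_2x128() is the premultiplied Porter-Duff OVER operator,
 * dst = src + (1 - alpha) * dst, where the caller passes the
 * already-expanded alpha (normally the source alpha).
 */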

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
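
/* in_over_2x128() fuses (src IN mask) OVER dst: both the source and
 * its alpha are first multiplied by the (per-channel) mask, then
 * composited with over_2x128(), i.e.
 * dst = src*mask + (1 - srca*mask) * dst.
 */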

/* load 4 pixels from a 16-byte-aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using a non-temporal (write-combining) store to a
 * 16-byte-aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels to a 16-byte-aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels to an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

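/* combine1() above and combine4() below implement the "unified" mask
 * handling shared by the scalar and 4-pixel paths: when a mask is
 * present, the source is multiplied by the mask's alpha before the
 * actual combine runs.  combine4() also returns zero early when all
 * four mask pixels are fully transparent.
 */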
static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

static force_inline void
core_combine_over_u_sse2_mask (uint32_t *         pd,
                               const uint32_t*    ps,
                               const uint32_t*    pm,
                               int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t*    ps,
                                  int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}
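
/* All of the combiners in this file follow the loop shape seen above:
 * a scalar head loop runs until pd reaches a 16-byte boundary, the
 * main loop then processes four pixels per iteration with aligned
 * stores, and a scalar tail handles the last w % 4 pixels.  The OVER
 * paths additionally skip work when a whole block is transparent
 * (is_zero) and store the source directly when it is opaque
 * (is_opaque).
 */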

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* Only the destination is known to be aligned here, so load
         * the source (and mask) unaligned.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4-pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
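
/* ATOP: dst = src * dst.alpha + dst * (1 - src.alpha); the result
 * keeps the destination's alpha.  sa above therefore holds the
 * negated source alpha and da the (un-negated) destination alpha.
 */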

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}
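
/* XOR: dst = src * (1 - dst.alpha) + dst * (1 - src.alpha), i.e. each
 * operand only contributes where the other is transparent.
 */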

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}
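
/* ADD needs no unpacking at all: _mm_adds_epu8 performs the
 * per-channel saturating byte add directly on the packed pixels.
 */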

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}
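
/* SATURATE scales the source so the additive blend cannot overflow:
 * when src.alpha exceeds the free alpha in the destination
 * (da = 255 - dst.alpha), the source is first multiplied by
 * DIV_UN8 (da, sa) ~= da * 255 / sa (DIV_UN8 comes from
 * pixman-combine32.h), after which the saturating add is exact.
 */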

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if the alpha of some source pixel is greater than the
         * respective ~alpha of its destination pixel, fall back to
         * the scalar path for this block
         */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }
}

static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i expAlpha = expand_alpha_1x128 (s);
    __m128i unpk_mask = unpack_32_1x128 (mask);
    __m128i unpk_dst  = unpack_32_1x128 (dst);

    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}
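
/* In the _ca ("component alpha") variants the mask carries a separate
 * alpha per color channel, so the mask pixels are applied with a full
 * per-channel multiply rather than with their alpha only, and the
 * mask is never consulted through combine1()/combine4().
 */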
1557
1558 static void
1559 sse2_combine_over_ca (pixman_implementation_t *imp,
1560                       pixman_op_t              op,
1561                       uint32_t *               pd,
1562                       const uint32_t *         ps,
1563                       const uint32_t *         pm,
1564                       int                      w)
1565 {
1566     uint32_t s, m, d;
1567
1568     __m128i xmm_alpha_lo, xmm_alpha_hi;
1569     __m128i xmm_src_lo, xmm_src_hi;
1570     __m128i xmm_dst_lo, xmm_dst_hi;
1571     __m128i xmm_mask_lo, xmm_mask_hi;
1572
1573     while (w && (unsigned long)pd & 15)
1574     {
1575         s = *ps++;
1576         m = *pm++;
1577         d = *pd;
1578
1579         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1580         w--;
1581     }
1582
1583     while (w >= 4)
1584     {
1585         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1586         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1587         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1588
1589         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1590         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1591         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1592
1593         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1594                             &xmm_alpha_lo, &xmm_alpha_hi);
1595
1596         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1597                        &xmm_alpha_lo, &xmm_alpha_hi,
1598                        &xmm_mask_lo, &xmm_mask_hi,
1599                        &xmm_dst_lo, &xmm_dst_hi);
1600
1601         save_128_aligned (
1602             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1603
1604         ps += 4;
1605         pd += 4;
1606         pm += 4;
1607         w -= 4;
1608     }
1609
1610     while (w)
1611     {
1612         s = *ps++;
1613         m = *pm++;
1614         d = *pd;
1615
1616         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1617         w--;
1618     }
1619 }
1620
1621 static force_inline uint32_t
1622 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1623                                          uint32_t mask,
1624                                          uint32_t dst)
1625 {
1626     __m128i d = unpack_32_1x128 (dst);
1627
1628     return pack_1x128_32 (
1629         over_1x128 (d, expand_alpha_1x128 (d),
1630                     pix_multiply_1x128 (unpack_32_1x128 (src),
1631                                         unpack_32_1x128 (mask))));
1632 }
1633
1634 static void
1635 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1636                               pixman_op_t              op,
1637                               uint32_t *               pd,
1638                               const uint32_t *         ps,
1639                               const uint32_t *         pm,
1640                               int                      w)
1641 {
1642     uint32_t s, m, d;
1643
1644     __m128i xmm_alpha_lo, xmm_alpha_hi;
1645     __m128i xmm_src_lo, xmm_src_hi;
1646     __m128i xmm_dst_lo, xmm_dst_hi;
1647     __m128i xmm_mask_lo, xmm_mask_hi;
1648
1649     while (w && (unsigned long)pd & 15)
1650     {
1651         s = *ps++;
1652         m = *pm++;
1653         d = *pd;
1654
1655         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1656         w--;
1657     }
1658
1659     while (w >= 4)
1660     {
1661         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1662         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1663         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1664
1665         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1666         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1667         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1668
1669         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1670                             &xmm_alpha_lo, &xmm_alpha_hi);
1671         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1672                             &xmm_mask_lo, &xmm_mask_hi,
1673                             &xmm_mask_lo, &xmm_mask_hi);
1674
1675         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1676                     &xmm_alpha_lo, &xmm_alpha_hi,
1677                     &xmm_mask_lo, &xmm_mask_hi);
1678
1679         save_128_aligned (
1680             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1681
1682         ps += 4;
1683         pd += 4;
1684         pm += 4;
1685         w -= 4;
1686     }
1687
1688     while (w)
1689     {
1690         s = *ps++;
1691         m = *pm++;
1692         d = *pd;
1693
1694         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1695         w--;
1696     }
1697 }
1698
1699 static void
1700 sse2_combine_in_ca (pixman_implementation_t *imp,
1701                     pixman_op_t              op,
1702                     uint32_t *               pd,
1703                     const uint32_t *         ps,
1704                     const uint32_t *         pm,
1705                     int                      w)
1706 {
1707     uint32_t s, m, d;
1708
1709     __m128i xmm_alpha_lo, xmm_alpha_hi;
1710     __m128i xmm_src_lo, xmm_src_hi;
1711     __m128i xmm_dst_lo, xmm_dst_hi;
1712     __m128i xmm_mask_lo, xmm_mask_hi;
1713
1714     while (w && (unsigned long)pd & 15)
1715     {
1716         s = *ps++;
1717         m = *pm++;
1718         d = *pd;
1719
1720         *pd++ = pack_1x128_32 (
1721             pix_multiply_1x128 (
1722                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1723                 expand_alpha_1x128 (unpack_32_1x128 (d))));
1724
1725         w--;
1726     }
1727
1728     while (w >= 4)
1729     {
1730         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1731         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1732         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1733
1734         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1735         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1736         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1737
1738         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1739                             &xmm_alpha_lo, &xmm_alpha_hi);
1740
1741         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1742                             &xmm_mask_lo, &xmm_mask_hi,
1743                             &xmm_dst_lo, &xmm_dst_hi);
1744
1745         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1746                             &xmm_alpha_lo, &xmm_alpha_hi,
1747                             &xmm_dst_lo, &xmm_dst_hi);
1748
1749         save_128_aligned (
1750             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1751
1752         ps += 4;
1753         pd += 4;
1754         pm += 4;
1755         w -= 4;
1756     }
1757
1758     while (w)
1759     {
1760         s = *ps++;
1761         m = *pm++;
1762         d = *pd;
1763
1764         *pd++ = pack_1x128_32 (
1765             pix_multiply_1x128 (
1766                 pix_multiply_1x128 (
1767                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1768                 expand_alpha_1x128 (unpack_32_1x128 (d))));
1769
1770         w--;
1771     }
1772 }
1773
1774 static void
1775 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1776                             pixman_op_t              op,
1777                             uint32_t *               pd,
1778                             const uint32_t *         ps,
1779                             const uint32_t *         pm,
1780                             int                      w)
1781 {
1782     uint32_t s, m, d;
1783
1784     __m128i xmm_alpha_lo, xmm_alpha_hi;
1785     __m128i xmm_src_lo, xmm_src_hi;
1786     __m128i xmm_dst_lo, xmm_dst_hi;
1787     __m128i xmm_mask_lo, xmm_mask_hi;
1788
1789     while (w && (unsigned long)pd & 15)
1790     {
1791         s = *ps++;
1792         m = *pm++;
1793         d = *pd;
1794
1795         *pd++ = pack_1x128_32 (
1796             pix_multiply_1x128 (
1797                 unpack_32_1x128 (d),
1798                 pix_multiply_1x128 (unpack_32_1x128 (m),
1799                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1800         w--;
1801     }
1802
1803     while (w >= 4)
1804     {
1805         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1806         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1807         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1808
1809         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1810         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1811         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1812
1813         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1814                             &xmm_alpha_lo, &xmm_alpha_hi);
1815         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1816                             &xmm_alpha_lo, &xmm_alpha_hi,
1817                             &xmm_alpha_lo, &xmm_alpha_hi);
1818
1819         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1820                             &xmm_alpha_lo, &xmm_alpha_hi,
1821                             &xmm_dst_lo, &xmm_dst_hi);
1822
1823         save_128_aligned (
1824             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1825
1826         ps += 4;
1827         pd += 4;
1828         pm += 4;
1829         w -= 4;
1830     }
1831
1832     while (w)
1833     {
1834         s = *ps++;
1835         m = *pm++;
1836         d = *pd;
1837
1838         *pd++ = pack_1x128_32 (
1839             pix_multiply_1x128 (
1840                 unpack_32_1x128 (d),
1841                 pix_multiply_1x128 (unpack_32_1x128 (m),
1842                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1843         w--;
1844     }
1845 }
1846
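/* OUT with a component-alpha mask:
 *
 *    dest = (src * mask) * (1 - alpha (dest))
 */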
1847 static void
1848 sse2_combine_out_ca (pixman_implementation_t *imp,
1849                      pixman_op_t              op,
1850                      uint32_t *               pd,
1851                      const uint32_t *         ps,
1852                      const uint32_t *         pm,
1853                      int                      w)
1854 {
1855     uint32_t s, m, d;
1856
1857     __m128i xmm_alpha_lo, xmm_alpha_hi;
1858     __m128i xmm_src_lo, xmm_src_hi;
1859     __m128i xmm_dst_lo, xmm_dst_hi;
1860     __m128i xmm_mask_lo, xmm_mask_hi;
1861
1862     while (w && (unsigned long)pd & 15)
1863     {
1864         s = *ps++;
1865         m = *pm++;
1866         d = *pd;
1867
1868         *pd++ = pack_1x128_32 (
1869             pix_multiply_1x128 (
1870                 pix_multiply_1x128 (
1871                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1872                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1873         w--;
1874     }
1875
1876     while (w >= 4)
1877     {
1878         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1879         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1880         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1881
1882         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1883         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1884         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1885
1886         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1887                             &xmm_alpha_lo, &xmm_alpha_hi);
1888         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1889                       &xmm_alpha_lo, &xmm_alpha_hi);
1890
1891         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1892                             &xmm_mask_lo, &xmm_mask_hi,
1893                             &xmm_dst_lo, &xmm_dst_hi);
1894         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1895                             &xmm_alpha_lo, &xmm_alpha_hi,
1896                             &xmm_dst_lo, &xmm_dst_hi);
1897
1898         save_128_aligned (
1899             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1900
1901         ps += 4;
1902         pd += 4;
1903         pm += 4;
1904         w -= 4;
1905     }
1906
1907     while (w)
1908     {
1909         s = *ps++;
1910         m = *pm++;
1911         d = *pd;
1912
1913         *pd++ = pack_1x128_32 (
1914             pix_multiply_1x128 (
1915                 pix_multiply_1x128 (
1916                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1917                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1918
1919         w--;
1920     }
1921 }
1922
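/* OUT_REVERSE with a component-alpha mask:
 *
 *    dest = dest * (1 - mask * alpha (src))
 */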
1923 static void
1924 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1925                              pixman_op_t              op,
1926                              uint32_t *               pd,
1927                              const uint32_t *         ps,
1928                              const uint32_t *         pm,
1929                              int                      w)
1930 {
1931     uint32_t s, m, d;
1932
1933     __m128i xmm_alpha_lo, xmm_alpha_hi;
1934     __m128i xmm_src_lo, xmm_src_hi;
1935     __m128i xmm_dst_lo, xmm_dst_hi;
1936     __m128i xmm_mask_lo, xmm_mask_hi;
1937
1938     while (w && (unsigned long)pd & 15)
1939     {
1940         s = *ps++;
1941         m = *pm++;
1942         d = *pd;
1943
1944         *pd++ = pack_1x128_32 (
1945             pix_multiply_1x128 (
1946                 unpack_32_1x128 (d),
1947                 negate_1x128 (pix_multiply_1x128 (
1948                                  unpack_32_1x128 (m),
1949                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1950         w--;
1951     }
1952
1953     while (w >= 4)
1954     {
1955         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1956         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1957         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1958
1959         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1960         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1961         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1962
1963         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1964                             &xmm_alpha_lo, &xmm_alpha_hi);
1965
1966         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1967                             &xmm_alpha_lo, &xmm_alpha_hi,
1968                             &xmm_mask_lo, &xmm_mask_hi);
1969
1970         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1971                       &xmm_mask_lo, &xmm_mask_hi);
1972
1973         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1974                             &xmm_mask_lo, &xmm_mask_hi,
1975                             &xmm_dst_lo, &xmm_dst_hi);
1976
1977         save_128_aligned (
1978             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1979
1980         ps += 4;
1981         pd += 4;
1982         pm += 4;
1983         w -= 4;
1984     }
1985
1986     while (w)
1987     {
1988         s = *ps++;
1989         m = *pm++;
1990         d = *pd;
1991
1992         *pd++ = pack_1x128_32 (
1993             pix_multiply_1x128 (
1994                 unpack_32_1x128 (d),
1995                 negate_1x128 (pix_multiply_1x128 (
1996                                  unpack_32_1x128 (m),
1997                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1998         w--;
1999     }
2000 }
2001
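/* ATOP with a component-alpha mask:
 *
 *    dest = (src * mask) * alpha (dest) + dest * (1 - mask * alpha (src))
 */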
2002 static force_inline uint32_t
2003 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2004                                  uint32_t mask,
2005                                  uint32_t dst)
2006 {
2007     __m128i m = unpack_32_1x128 (mask);
2008     __m128i s = unpack_32_1x128 (src);
2009     __m128i d = unpack_32_1x128 (dst);
2010     __m128i sa = expand_alpha_1x128 (s);
2011     __m128i da = expand_alpha_1x128 (d);
2012
2013     s = pix_multiply_1x128 (s, m);
2014     m = negate_1x128 (pix_multiply_1x128 (m, sa));
2015
2016     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2017 }
2018
2019 static void
2020 sse2_combine_atop_ca (pixman_implementation_t *imp,
2021                       pixman_op_t              op,
2022                       uint32_t *               pd,
2023                       const uint32_t *         ps,
2024                       const uint32_t *         pm,
2025                       int                      w)
2026 {
2027     uint32_t s, m, d;
2028
2029     __m128i xmm_src_lo, xmm_src_hi;
2030     __m128i xmm_dst_lo, xmm_dst_hi;
2031     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2032     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2033     __m128i xmm_mask_lo, xmm_mask_hi;
2034
2035     while (w && (unsigned long)pd & 15)
2036     {
2037         s = *ps++;
2038         m = *pm++;
2039         d = *pd;
2040
2041         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2042         w--;
2043     }
2044
2045     while (w >= 4)
2046     {
2047         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2048         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2049         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2050
2051         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2052         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2053         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2054
2055         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2056                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2057         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2058                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2059
2060         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2061                             &xmm_mask_lo, &xmm_mask_hi,
2062                             &xmm_src_lo, &xmm_src_hi);
2063         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2064                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2065                             &xmm_mask_lo, &xmm_mask_hi);
2066
2067         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2068
2069         pix_add_multiply_2x128 (
2070             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2071             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2072             &xmm_dst_lo, &xmm_dst_hi);
2073
2074         save_128_aligned (
2075             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2076
2077         ps += 4;
2078         pd += 4;
2079         pm += 4;
2080         w -= 4;
2081     }
2082
2083     while (w)
2084     {
2085         s = *ps++;
2086         m = *pm++;
2087         d = *pd;
2088
2089         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2090         w--;
2091     }
2092 }
2093
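/* ATOP_REVERSE with a component-alpha mask:
 *
 *    dest = (src * mask) * (1 - alpha (dest)) + dest * (mask * alpha (src))
 */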
2094 static force_inline uint32_t
2095 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2096                                          uint32_t mask,
2097                                          uint32_t dst)
2098 {
2099     __m128i m = unpack_32_1x128 (mask);
2100     __m128i s = unpack_32_1x128 (src);
2101     __m128i d = unpack_32_1x128 (dst);
2102
2103     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2104     __m128i sa = expand_alpha_1x128 (s);
2105
2106     s = pix_multiply_1x128 (s, m);
2107     m = pix_multiply_1x128 (m, sa);
2108
2109     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2110 }
2111
2112 static void
2113 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2114                               pixman_op_t              op,
2115                               uint32_t *               pd,
2116                               const uint32_t *         ps,
2117                               const uint32_t *         pm,
2118                               int                      w)
2119 {
2120     uint32_t s, m, d;
2121
2122     __m128i xmm_src_lo, xmm_src_hi;
2123     __m128i xmm_dst_lo, xmm_dst_hi;
2124     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2125     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2126     __m128i xmm_mask_lo, xmm_mask_hi;
2127
2128     while (w && (unsigned long)pd & 15)
2129     {
2130         s = *ps++;
2131         m = *pm++;
2132         d = *pd;
2133
2134         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2135         w--;
2136     }
2137
2138     while (w >= 4)
2139     {
2140         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2141         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2142         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2143
2144         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2145         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2146         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2147
2148         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2149                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2150         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2151                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2152
2153         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2154                             &xmm_mask_lo, &xmm_mask_hi,
2155                             &xmm_src_lo, &xmm_src_hi);
2156         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2157                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2158                             &xmm_mask_lo, &xmm_mask_hi);
2159
2160         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2161                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2162
2163         pix_add_multiply_2x128 (
2164             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2165             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2166             &xmm_dst_lo, &xmm_dst_hi);
2167
2168         save_128_aligned (
2169             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2170
2171         ps += 4;
2172         pd += 4;
2173         pm += 4;
2174         w -= 4;
2175     }
2176
2177     while (w)
2178     {
2179         s = *ps++;
2180         m = *pm++;
2181         d = *pd;
2182
2183         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2184         w--;
2185     }
2186 }
2187
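/* XOR with a component-alpha mask:
 *
 *    dest = (src * mask) * (1 - alpha (dest)) + dest * (1 - mask * alpha (src))
 */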
2188 static force_inline uint32_t
2189 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2190                                 uint32_t mask,
2191                                 uint32_t dst)
2192 {
2193     __m128i a = unpack_32_1x128 (mask);
2194     __m128i s = unpack_32_1x128 (src);
2195     __m128i d = unpack_32_1x128 (dst);
2196
2197     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2198                                        a, expand_alpha_1x128 (s)));
2199     __m128i dest      = pix_multiply_1x128 (s, a);
2200     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2201
2202     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2203                                                 &alpha_dst,
2204                                                 &dest,
2205                                                 &alpha_src));
2206 }
2207
2208 static void
2209 sse2_combine_xor_ca (pixman_implementation_t *imp,
2210                      pixman_op_t              op,
2211                      uint32_t *               pd,
2212                      const uint32_t *         ps,
2213                      const uint32_t *         pm,
2214                      int                      w)
2215 {
2216     uint32_t s, m, d;
2217
2218     __m128i xmm_src_lo, xmm_src_hi;
2219     __m128i xmm_dst_lo, xmm_dst_hi;
2220     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2221     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2222     __m128i xmm_mask_lo, xmm_mask_hi;
2223
2224     while (w && (unsigned long)pd & 15)
2225     {
2226         s = *ps++;
2227         m = *pm++;
2228         d = *pd;
2229
2230         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2231         w--;
2232     }
2233
2234     while (w >= 4)
2235     {
2236         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2237         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2238         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2239
2240         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2241         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2242         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2243
2244         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2245                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2246         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2247                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2248
2249         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2250                             &xmm_mask_lo, &xmm_mask_hi,
2251                             &xmm_src_lo, &xmm_src_hi);
2252         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2253                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2254                             &xmm_mask_lo, &xmm_mask_hi);
2255
2256         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2257                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2258         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2259                       &xmm_mask_lo, &xmm_mask_hi);
2260
2261         pix_add_multiply_2x128 (
2262             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2263             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2264             &xmm_dst_lo, &xmm_dst_hi);
2265
2266         save_128_aligned (
2267             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2268
2269         ps += 4;
2270         pd += 4;
2271         pm += 4;
2272         w -= 4;
2273     }
2274
2275     while (w)
2276     {
2277         s = *ps++;
2278         m = *pm++;
2279         d = *pd;
2280
2281         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2282         w--;
2283     }
2284 }
2285
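/* ADD with a component-alpha mask:
 *
 *    dest = saturate (src * mask + dest)
 */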
2286 static void
2287 sse2_combine_add_ca (pixman_implementation_t *imp,
2288                      pixman_op_t              op,
2289                      uint32_t *               pd,
2290                      const uint32_t *         ps,
2291                      const uint32_t *         pm,
2292                      int                      w)
2293 {
2294     uint32_t s, m, d;
2295
2296     __m128i xmm_src_lo, xmm_src_hi;
2297     __m128i xmm_dst_lo, xmm_dst_hi;
2298     __m128i xmm_mask_lo, xmm_mask_hi;
2299
2300     while (w && (unsigned long)pd & 15)
2301     {
2302         s = *ps++;
2303         m = *pm++;
2304         d = *pd;
2305
2306         *pd++ = pack_1x128_32 (
2307             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2308                                                unpack_32_1x128 (m)),
2309                            unpack_32_1x128 (d)));
2310         w--;
2311     }
2312
2313     while (w >= 4)
2314     {
2315         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2316         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2317         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2318
2319         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2320         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2321         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2322
2323         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2324                             &xmm_mask_lo, &xmm_mask_hi,
2325                             &xmm_src_lo, &xmm_src_hi);
2326
2327         save_128_aligned (
2328             (__m128i*)pd, pack_2x128_128 (
2329                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2330                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2331
2332         ps += 4;
2333         pd += 4;
2334         pm += 4;
2335         w -= 4;
2336     }
2337
2338     while (w)
2339     {
2340         s = *ps++;
2341         m = *pm++;
2342         d = *pd;
2343
2344         *pd++ = pack_1x128_32 (
2345             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2346                                                unpack_32_1x128 (m)),
2347                            unpack_32_1x128 (d)));
2348         w--;
2349     }
2350 }
2351
2352 static force_inline __m128i
2353 create_mask_16_128 (uint16_t mask)
2354 {
2355     return _mm_set1_epi16 (mask);
2356 }
2357
2358 /* Work around a code generation bug in Sun Studio 12. */
2359 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2360 # define create_mask_2x32_128(mask0, mask1)                             \
2361     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2362 #else
2363 static force_inline __m128i
2364 create_mask_2x32_128 (uint32_t mask0,
2365                       uint32_t mask1)
2366 {
2367     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2368 }
2369 #endif
2370
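/* Fast path: OVER with a solid source and no mask on an
 * {a,x}8r8g8b8 destination.  The source and its expanded alpha
 * are computed once; each row runs one pixel at a time until dst
 * is 16-byte aligned, then four pixels per SSE2 iteration.
 */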
2371 static void
2372 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2373                             pixman_op_t              op,
2374                             pixman_image_t *         src_image,
2375                             pixman_image_t *         mask_image,
2376                             pixman_image_t *         dst_image,
2377                             int32_t                  src_x,
2378                             int32_t                  src_y,
2379                             int32_t                  mask_x,
2380                             int32_t                  mask_y,
2381                             int32_t                  dest_x,
2382                             int32_t                  dest_y,
2383                             int32_t                  width,
2384                             int32_t                  height)
2385 {
2386     uint32_t src;
2387     uint32_t    *dst_line, *dst, d;
2388     int32_t w;
2389     int dst_stride;
2390     __m128i xmm_src, xmm_alpha;
2391     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2392
2393     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2394
2395     if (src == 0)
2396         return;
2397
2398     PIXMAN_IMAGE_GET_LINE (
2399         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2400
2401     xmm_src = expand_pixel_32_1x128 (src);
2402     xmm_alpha = expand_alpha_1x128 (xmm_src);
2403
2404     while (height--)
2405     {
2406         dst = dst_line;
2407
2408         dst_line += dst_stride;
2409         w = width;
2410
2411         while (w && (unsigned long)dst & 15)
2412         {
2413             d = *dst;
2414             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2415                                                 xmm_alpha,
2416                                                 unpack_32_1x128 (d)));
2417             w--;
2418         }
2419
2420         while (w >= 4)
2421         {
2422             xmm_dst = load_128_aligned ((__m128i*)dst);
2423
2424             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2425
2426             over_2x128 (&xmm_src, &xmm_src,
2427                         &xmm_alpha, &xmm_alpha,
2428                         &xmm_dst_lo, &xmm_dst_hi);
2429
2430             /* rebuild the 4 pixel data and save */
2431             save_128_aligned (
2432                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2433
2434             w -= 4;
2435             dst += 4;
2436         }
2437
2438         while (w)
2439         {
2440             d = *dst;
2441             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2442                                                 xmm_alpha,
2443                                                 unpack_32_1x128 (d)));
2444             w--;
2445         }
2446
2447     }
2448 }
2449
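/* OVER with a solid source on an r5g6b5 destination.  The 565
 * pixels are expanded to 8888, composited, and packed back,
 * eight pixels per SSE2 iteration.
 */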
2450 static void
2451 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2452                             pixman_op_t              op,
2453                             pixman_image_t *         src_image,
2454                             pixman_image_t *         mask_image,
2455                             pixman_image_t *         dst_image,
2456                             int32_t                  src_x,
2457                             int32_t                  src_y,
2458                             int32_t                  mask_x,
2459                             int32_t                  mask_y,
2460                             int32_t                  dest_x,
2461                             int32_t                  dest_y,
2462                             int32_t                  width,
2463                             int32_t                  height)
2464 {
2465     uint32_t src;
2466     uint16_t    *dst_line, *dst, d;
2467     int32_t w;
2468     int dst_stride;
2469     __m128i xmm_src, xmm_alpha;
2470     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2471
2472     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2473
2474     if (src == 0)
2475         return;
2476
2477     PIXMAN_IMAGE_GET_LINE (
2478         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2479
2480     xmm_src = expand_pixel_32_1x128 (src);
2481     xmm_alpha = expand_alpha_1x128 (xmm_src);
2482
2483     while (height--)
2484     {
2485         dst = dst_line;
2486
2487         dst_line += dst_stride;
2488         w = width;
2489
2490         while (w && (unsigned long)dst & 15)
2491         {
2492             d = *dst;
2493
2494             *dst++ = pack_565_32_16 (
2495                 pack_1x128_32 (over_1x128 (xmm_src,
2496                                            xmm_alpha,
2497                                            expand565_16_1x128 (d))));
2498             w--;
2499         }
2500
2501         while (w >= 8)
2502         {
2503             xmm_dst = load_128_aligned ((__m128i*)dst);
2504
2505             unpack_565_128_4x128 (xmm_dst,
2506                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507
2508             over_2x128 (&xmm_src, &xmm_src,
2509                         &xmm_alpha, &xmm_alpha,
2510                         &xmm_dst0, &xmm_dst1);
2511             over_2x128 (&xmm_src, &xmm_src,
2512                         &xmm_alpha, &xmm_alpha,
2513                         &xmm_dst2, &xmm_dst3);
2514
2515             xmm_dst = pack_565_4x128_128 (
2516                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2517
2518             save_128_aligned ((__m128i*)dst, xmm_dst);
2519
2520             dst += 8;
2521             w -= 8;
2522         }
2523
2524         while (w--)
2525         {
2526             d = *dst;
2527             *dst++ = pack_565_32_16 (
2528                 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2529                                            expand565_16_1x128 (d))));
2530         }
2531     }
2532
2533 }
2534
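/* ADD with a solid source and a component-alpha a8r8g8b8 mask:
 *
 *    dest = saturate (src * mask + dest)
 *
 * Groups of four pixels whose mask words are all zero are
 * detected with a packed compare and skipped.
 */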
2535 static void
2536 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2537                                    pixman_op_t              op,
2538                                    pixman_image_t *         src_image,
2539                                    pixman_image_t *         mask_image,
2540                                    pixman_image_t *         dst_image,
2541                                    int32_t                  src_x,
2542                                    int32_t                  src_y,
2543                                    int32_t                  mask_x,
2544                                    int32_t                  mask_y,
2545                                    int32_t                  dest_x,
2546                                    int32_t                  dest_y,
2547                                    int32_t                  width,
2548                                    int32_t                  height)
2549 {
2550     uint32_t src;
2551     uint32_t    *dst_line, d;
2552     uint32_t    *mask_line, m;
2553     uint32_t pack_cmp;
2554     int dst_stride, mask_stride;
2555
2556     __m128i xmm_src, xmm_alpha;
2557     __m128i xmm_dst;
2558     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2559
2560     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2561
2562     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2564
2565     if (src == 0)
2566         return;
2567
2568     PIXMAN_IMAGE_GET_LINE (
2569         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2570     PIXMAN_IMAGE_GET_LINE (
2571         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2572
2573     xmm_src = _mm_unpacklo_epi8 (
2574         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2575     xmm_alpha = expand_alpha_1x128 (xmm_src);
2576     mmx_src   = xmm_src;
2577     mmx_alpha = xmm_alpha;
2578
2579     while (height--)
2580     {
2581         int w = width;
2582         const uint32_t *pm = (uint32_t *)mask_line;
2583         uint32_t *pd = (uint32_t *)dst_line;
2584
2585         dst_line += dst_stride;
2586         mask_line += mask_stride;
2587
2588         while (w && (unsigned long)pd & 15)
2589         {
2590             m = *pm++;
2591
2592             if (m)
2593             {
2594                 d = *pd;
2595
2596                 mmx_mask = unpack_32_1x128 (m);
2597                 mmx_dest = unpack_32_1x128 (d);
2598
2599                 *pd = pack_1x128_32 (
2600                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2601             }
2602
2603             pd++;
2604             w--;
2605         }
2606
2607         while (w >= 4)
2608         {
2609             xmm_mask = load_128_unaligned ((__m128i*)pm);
2610
2611             pack_cmp =
2612                 _mm_movemask_epi8 (
2613                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2614
2615             /* if all bits of the mask are zero, pack_cmp is 0xffff */
2616             if (pack_cmp != 0xffff)
2617             {
2618                 xmm_dst = load_128_aligned ((__m128i*)pd);
2619
2620                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2621
2622                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2623                                     &xmm_mask_lo, &xmm_mask_hi,
2624                                     &xmm_mask_lo, &xmm_mask_hi);
2625                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2626
2627                 save_128_aligned (
2628                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2629             }
2630
2631             pd += 4;
2632             pm += 4;
2633             w -= 4;
2634         }
2635
2636         while (w)
2637         {
2638             m = *pm++;
2639
2640             if (m)
2641             {
2642                 d = *pd;
2643
2644                 mmx_mask = unpack_32_1x128 (m);
2645                 mmx_dest = unpack_32_1x128 (d);
2646
2647                 *pd = pack_1x128_32 (
2648                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2649             }
2650
2651             pd++;
2652             w--;
2653         }
2654     }
2655
2656 }
2657
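/* OVER with a solid source and a component-alpha a8r8g8b8 mask:
 *
 *    dest = (src IN mask) OVER dest
 *
 * computed per channel; groups of four all-zero mask words are
 * skipped.
 */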
2658 static void
2659 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2660                                     pixman_op_t              op,
2661                                     pixman_image_t *         src_image,
2662                                     pixman_image_t *         mask_image,
2663                                     pixman_image_t *         dst_image,
2664                                     int32_t                  src_x,
2665                                     int32_t                  src_y,
2666                                     int32_t                  mask_x,
2667                                     int32_t                  mask_y,
2668                                     int32_t                  dest_x,
2669                                     int32_t                  dest_y,
2670                                     int32_t                  width,
2671                                     int32_t                  height)
2672 {
2673     uint32_t src;
2674     uint32_t    *dst_line, d;
2675     uint32_t    *mask_line, m;
2676     uint32_t pack_cmp;
2677     int dst_stride, mask_stride;
2678
2679     __m128i xmm_src, xmm_alpha;
2680     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2681     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2682
2683     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2684
2685     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2686
2687     if (src == 0)
2688         return;
2689
2690     PIXMAN_IMAGE_GET_LINE (
2691         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2692     PIXMAN_IMAGE_GET_LINE (
2693         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2694
2695     xmm_src = _mm_unpacklo_epi8 (
2696         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2697     xmm_alpha = expand_alpha_1x128 (xmm_src);
2698     mmx_src   = xmm_src;
2699     mmx_alpha = xmm_alpha;
2700
2701     while (height--)
2702     {
2703         int w = width;
2704         const uint32_t *pm = (uint32_t *)mask_line;
2705         uint32_t *pd = (uint32_t *)dst_line;
2706
2707         dst_line += dst_stride;
2708         mask_line += mask_stride;
2709
2710         while (w && (unsigned long)pd & 15)
2711         {
2712             m = *pm++;
2713
2714             if (m)
2715             {
2716                 d = *pd;
2717                 mmx_mask = unpack_32_1x128 (m);
2718                 mmx_dest = unpack_32_1x128 (d);
2719
2720                 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2721                                                   &mmx_alpha,
2722                                                   &mmx_mask,
2723                                                   &mmx_dest));
2724             }
2725
2726             pd++;
2727             w--;
2728         }
2729
2730         while (w >= 4)
2731         {
2732             xmm_mask = load_128_unaligned ((__m128i*)pm);
2733
2734             pack_cmp =
2735                 _mm_movemask_epi8 (
2736                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2737
2738             /* if all bits of the mask are zero, pack_cmp is 0xffff */
2739             if (pack_cmp != 0xffff)
2740             {
2741                 xmm_dst = load_128_aligned ((__m128i*)pd);
2742
2743                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2744                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2745
2746                 in_over_2x128 (&xmm_src, &xmm_src,
2747                                &xmm_alpha, &xmm_alpha,
2748                                &xmm_mask_lo, &xmm_mask_hi,
2749                                &xmm_dst_lo, &xmm_dst_hi);
2750
2751                 save_128_aligned (
2752                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2753             }
2754
2755             pd += 4;
2756             pm += 4;
2757             w -= 4;
2758         }
2759
2760         while (w)
2761         {
2762             m = *pm++;
2763
2764             if (m)
2765             {
2766                 d = *pd;
2767                 mmx_mask = unpack_32_1x128 (m);
2768                 mmx_dest = unpack_32_1x128 (d);
2769
2770                 *pd = pack_1x128_32 (
2771                     in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2772             }
2773
2774             pd++;
2775             w--;
2776         }
2777     }
2778
2779 }
2780
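/* OVER with an a8r8g8b8 source and a constant mask.  Only the
 * alpha byte of the solid mask is used, and zero source pixels
 * are skipped since they leave the destination unchanged.
 */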
2781 static void
2782 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2783                                  pixman_op_t              op,
2784                                  pixman_image_t *         src_image,
2785                                  pixman_image_t *         mask_image,
2786                                  pixman_image_t *         dst_image,
2787                                  int32_t                  src_x,
2788                                  int32_t                  src_y,
2789                                  int32_t                  mask_x,
2790                                  int32_t                  mask_y,
2791                                  int32_t                  dest_x,
2792                                  int32_t                  dest_y,
2793                                  int32_t                  width,
2794                                  int32_t                  height)
2795 {
2796     uint32_t    *dst_line, *dst;
2797     uint32_t    *src_line, *src;
2798     uint32_t mask;
2799     int32_t w;
2800     int dst_stride, src_stride;
2801
2802     __m128i xmm_mask;
2803     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2804     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2805     __m128i xmm_alpha_lo, xmm_alpha_hi;
2806
2807     PIXMAN_IMAGE_GET_LINE (
2808         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2809     PIXMAN_IMAGE_GET_LINE (
2810         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2811
2812     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2813
2814     xmm_mask = create_mask_16_128 (mask >> 24);
2815
2816     while (height--)
2817     {
2818         dst = dst_line;
2819         dst_line += dst_stride;
2820         src = src_line;
2821         src_line += src_stride;
2822         w = width;
2823
2824         while (w && (unsigned long)dst & 15)
2825         {
2826             uint32_t s = *src++;
2827
2828             if (s)
2829             {
2830                 uint32_t d = *dst;
2831                 
2832                 __m128i ms = unpack_32_1x128 (s);
2833                 __m128i alpha = expand_alpha_1x128 (ms);
2834                 __m128i mask  = xmm_mask;
2835                 __m128i dest  = unpack_32_1x128 (d);
2836
2837                 *dst = pack_1x128_32 (
2838                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2839             }
2840             dst++;
2841             w--;
2842         }
2843
2844         while (w >= 4)
2845         {
2846             xmm_src = load_128_unaligned ((__m128i*)src);
2847
2848             if (!is_zero (xmm_src))
2849             {
2850                 xmm_dst = load_128_aligned ((__m128i*)dst);
2851                 
2852                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2853                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2854                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2855                                     &xmm_alpha_lo, &xmm_alpha_hi);
2856                 
2857                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2858                                &xmm_alpha_lo, &xmm_alpha_hi,
2859                                &xmm_mask, &xmm_mask,
2860                                &xmm_dst_lo, &xmm_dst_hi);
2861                 
2862                 save_128_aligned (
2863                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2864             }
2865                 
2866             dst += 4;
2867             src += 4;
2868             w -= 4;
2869         }
2870
2871         while (w)
2872         {
2873             uint32_t s = *src++;
2874
2875             if (s)
2876             {
2877                 uint32_t d = *dst;
2878                 
2879                 __m128i ms = unpack_32_1x128 (s);
2880                 __m128i alpha = expand_alpha_1x128 (ms);
2881                 __m128i mask  = xmm_mask;
2882                 __m128i dest  = unpack_32_1x128 (d);
2883                 
2884                 *dst = pack_1x128_32 (
2885                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2886             }
2887
2888             dst++;
2889             w--;
2890         }
2891     }
2892
2893 }
2894
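/* SRC from x8r8g8b8 to a8r8g8b8: copy the pixels and force the
 * alpha channel to 0xff, sixteen pixels (64 bytes) per SSE2
 * iteration.
 */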
2895 static void
2896 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2897                               pixman_op_t              op,
2898                               pixman_image_t *         src_image,
2899                               pixman_image_t *         mask_image,
2900                               pixman_image_t *         dst_image,
2901                               int32_t                  src_x,
2902                               int32_t                  src_y,
2903                               int32_t                  mask_x,
2904                               int32_t                  mask_y,
2905                               int32_t                  dest_x,
2906                               int32_t                  dest_y,
2907                               int32_t                  width,
2908                               int32_t                  height)
2909 {
2910     uint32_t    *dst_line, *dst;
2911     uint32_t    *src_line, *src;
2912     int32_t w;
2913     int dst_stride, src_stride;
2914
2915
2917         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2918     PIXMAN_IMAGE_GET_LINE (
2919         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2920
2921     while (height--)
2922     {
2923         dst = dst_line;
2924         dst_line += dst_stride;
2925         src = src_line;
2926         src_line += src_stride;
2927         w = width;
2928
2929         while (w && (unsigned long)dst & 15)
2930         {
2931             *dst++ = *src++ | 0xff000000;
2932             w--;
2933         }
2934
2935         while (w >= 16)
2936         {
2937             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2938             
2939             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2940             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2941             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2942             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2943             
2944             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2945             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2946             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2947             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2948             
2949             dst += 16;
2950             src += 16;
2951             w -= 16;
2952         }
2953
2954         while (w)
2955         {
2956             *dst++ = *src++ | 0xff000000;
2957             w--;
2958         }
2959     }
2960
2961 }
2962
2963 static void
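/* OVER with an x8r8g8b8 source and a constant mask.  The source
 * is opaque (alpha forced to 0xff), so the expanded source alpha
 * is simply the constant 0x00ff vector.
 */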
2964 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2965                                  pixman_op_t              op,
2966                                  pixman_image_t *         src_image,
2967                                  pixman_image_t *         mask_image,
2968                                  pixman_image_t *         dst_image,
2969                                  int32_t                  src_x,
2970                                  int32_t                  src_y,
2971                                  int32_t                  mask_x,
2972                                  int32_t                  mask_y,
2973                                  int32_t                  dest_x,
2974                                  int32_t                  dest_y,
2975                                  int32_t                  width,
2976                                  int32_t                  height)
2977 {
2978     uint32_t    *dst_line, *dst;
2979     uint32_t    *src_line, *src;
2980     uint32_t mask;
2981     int dst_stride, src_stride;
2982     int32_t w;
2983
2984     __m128i xmm_mask, xmm_alpha;
2985     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2986     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2987
2988     PIXMAN_IMAGE_GET_LINE (
2989         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2990     PIXMAN_IMAGE_GET_LINE (
2991         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2992
2993     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2994
2995     xmm_mask = create_mask_16_128 (mask >> 24);
2996     xmm_alpha = mask_00ff;
2997
2998     while (height--)
2999     {
3000         dst = dst_line;
3001         dst_line += dst_stride;
3002         src = src_line;
3003         src_line += src_stride;
3004         w = width;
3005
3006         while (w && (unsigned long)dst & 15)
3007         {
3008             uint32_t s = (*src++) | 0xff000000;
3009             uint32_t d = *dst;
3010
3011             __m128i ms    = unpack_32_1x128 (s);
3012             __m128i alpha = xmm_alpha;
3013             __m128i mask  = xmm_mask;
3014             __m128i dest  = unpack_32_1x128 (d);
3015
3016             *dst++ = pack_1x128_32 (
3017                 in_over_1x128 (&ms, &alpha, &mask, &dest));
3018
3019             w--;
3020         }
3021
3022         while (w >= 4)
3023         {
3024             xmm_src = _mm_or_si128 (
3025                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3026             xmm_dst = load_128_aligned ((__m128i*)dst);
3027
3028             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3029             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3030
3031             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3032                            &xmm_alpha, &xmm_alpha,
3033                            &xmm_mask, &xmm_mask,
3034                            &xmm_dst_lo, &xmm_dst_hi);
3035
3036             save_128_aligned (
3037                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3038
3039             dst += 4;
3040             src += 4;
3041             w -= 4;
3042
3043         }
3044
3045         while (w)
3046         {
3047             uint32_t s = (*src++) | 0xff000000;
3048             uint32_t d = *dst;
3049
3050             __m128i ms    = unpack_32_1x128 (s);
3051             __m128i alpha = xmm_alpha;
3052             __m128i mask  = xmm_mask;
3053             __m128i dest  = unpack_32_1x128 (d);
3054
3055             *dst++ = pack_1x128_32 (
3056                 in_over_1x128 (&ms, &alpha, &mask, &dest));
3057
3058             w--;
3059         }
3060     }
3061
3062 }
3063
3064 static void
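/* OVER between two a8r8g8b8 images with no mask, implemented as
 * one sse2_combine_over_u () call per row.
 */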
3065 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3066                                pixman_op_t              op,
3067                                pixman_image_t *         src_image,
3068                                pixman_image_t *         mask_image,
3069                                pixman_image_t *         dst_image,
3070                                int32_t                  src_x,
3071                                int32_t                  src_y,
3072                                int32_t                  mask_x,
3073                                int32_t                  mask_y,
3074                                int32_t                  dest_x,
3075                                int32_t                  dest_y,
3076                                int32_t                  width,
3077                                int32_t                  height)
3078 {
3079     int dst_stride, src_stride;
3080     uint32_t    *dst_line, *dst;
3081     uint32_t    *src_line, *src;
3082
3083     PIXMAN_IMAGE_GET_LINE (
3084         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3085     PIXMAN_IMAGE_GET_LINE (
3086         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3087
3088     dst = dst_line;
3089     src = src_line;
3090
3091     while (height--)
3092     {
3093         sse2_combine_over_u (imp, op, dst, src, NULL, width);
3094
3095         dst += dst_stride;
3096         src += src_stride;
3097     }
3098 }
3099
3100 static force_inline uint16_t
3101 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3102 {
3103     __m128i ms;
3104
3105     ms = unpack_32_1x128 (src);
3106     return pack_565_32_16 (
3107         pack_1x128_32 (
3108             over_1x128 (
3109                 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3110 }
3111
3112 static void
3113 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3114                                pixman_op_t              op,
3115                                pixman_image_t *         src_image,
3116                                pixman_image_t *         mask_image,
3117                                pixman_image_t *         dst_image,
3118                                int32_t                  src_x,
3119                                int32_t                  src_y,
3120                                int32_t                  mask_x,
3121                                int32_t                  mask_y,
3122                                int32_t                  dest_x,
3123                                int32_t                  dest_y,
3124                                int32_t                  width,
3125                                int32_t                  height)
3126 {
3127     uint16_t    *dst_line, *dst, d;
3128     uint32_t    *src_line, *src, s;
3129     int dst_stride, src_stride;
3130     int32_t w;
3131
3132     __m128i xmm_alpha_lo, xmm_alpha_hi;
3133     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3134     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3135
3136     PIXMAN_IMAGE_GET_LINE (
3137         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3138     PIXMAN_IMAGE_GET_LINE (
3139         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3140
3141     while (height--)
3142     {
3143         dst = dst_line;
3144         src = src_line;
3145
3146         dst_line += dst_stride;
3147         src_line += src_stride;
3148         w = width;
3149
3150         /* Align dst on a 16-byte boundary */
3151         while (w &&
3152                ((unsigned long)dst & 15))
3153         {
3154             s = *src++;
3155             d = *dst;
3156
3157             *dst++ = composite_over_8888_0565pixel (s, d);
3158             w--;
3159         }
3160
3161         /* Process eight pixels per iteration */
3162         while (w >= 8)
3163         {
3164             /* The source may not be 16-byte aligned,
3165              * so use an unaligned load.
3166              */
3167             xmm_src = load_128_unaligned ((__m128i*) src);
3168             xmm_dst = load_128_aligned ((__m128i*) dst);
3169
3171             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3172             unpack_565_128_4x128 (xmm_dst,
3173                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3174             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3175                                 &xmm_alpha_lo, &xmm_alpha_hi);
3176
3177             /* Load the next four source pixels early to
3178              * overlap the memory read with the computation.
3179              */
3180             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3181
3182             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3183                         &xmm_alpha_lo, &xmm_alpha_hi,
3184                         &xmm_dst0, &xmm_dst1);
3185
3187             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3188             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3189                                 &xmm_alpha_lo, &xmm_alpha_hi);
3190
3191             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3192                         &xmm_alpha_lo, &xmm_alpha_hi,
3193                         &xmm_dst2, &xmm_dst3);
3194
3195             save_128_aligned (
3196                 (__m128i*)dst, pack_565_4x128_128 (
3197                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3198
3199             w -= 8;
3200             dst += 8;
3201             src += 8;
3202         }
3203
3204         while (w--)
3205         {
3206             s = *src++;
3207             d = *dst;
3208
3209             *dst++ = composite_over_8888_0565pixel (s, d);
3210         }
3211     }
3212
3213 }
3214
3215 static void
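/* OVER with a solid source and an a8 mask.  Four mask bytes are
 * read at a time; a group is stored directly when the source is
 * opaque and the mask is 0xffffffff, and skipped when the mask
 * is zero.
 */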
3216 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3217                               pixman_op_t              op,
3218                               pixman_image_t *         src_image,
3219                               pixman_image_t *         mask_image,
3220                               pixman_image_t *         dst_image,
3221                               int32_t                  src_x,
3222                               int32_t                  src_y,
3223                               int32_t                  mask_x,
3224                               int32_t                  mask_y,
3225                               int32_t                  dest_x,
3226                               int32_t                  dest_y,
3227                               int32_t                  width,
3228                               int32_t                  height)
3229 {
3230     uint32_t src, srca;
3231     uint32_t *dst_line, *dst;
3232     uint8_t *mask_line, *mask;
3233     int dst_stride, mask_stride;
3234     int32_t w;
3235     uint32_t m, d;
3236
3237     __m128i xmm_src, xmm_alpha, xmm_def;
3238     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3239     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3240
3241     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3242
3243     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3244
3245     srca = src >> 24;
3246     if (src == 0)
3247         return;
3248
3249     PIXMAN_IMAGE_GET_LINE (
3250         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3251     PIXMAN_IMAGE_GET_LINE (
3252         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3253
3254     xmm_def = create_mask_2x32_128 (src, src);
3255     xmm_src = expand_pixel_32_1x128 (src);
3256     xmm_alpha = expand_alpha_1x128 (xmm_src);
3257     mmx_src   = xmm_src;
3258     mmx_alpha = xmm_alpha;
3259
3260     while (height--)
3261     {
3262         dst = dst_line;
3263         dst_line += dst_stride;
3264         mask = mask_line;
3265         mask_line += mask_stride;
3266         w = width;
3267
3268         while (w && (unsigned long)dst & 15)
3269         {
3270             uint8_t m = *mask++;
3271
3272             if (m)
3273             {
3274                 d = *dst;
3275                 mmx_mask = expand_pixel_8_1x128 (m);
3276                 mmx_dest = unpack_32_1x128 (d);
3277
3278                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3279                                                    &mmx_alpha,
3280                                                    &mmx_mask,
3281                                                    &mmx_dest));
3282             }
3283
3284             w--;
3285             dst++;
3286         }
3287
3288         while (w >= 4)
3289         {
3290             m = *((uint32_t*)mask);
3291
3292             if (srca == 0xff && m == 0xffffffff)
3293             {
3294                 save_128_aligned ((__m128i*)dst, xmm_def);
3295             }
3296             else if (m)
3297             {
3298                 xmm_dst = load_128_aligned ((__m128i*) dst);
3299                 xmm_mask = unpack_32_1x128 (m);
3300                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3301
3303                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3304                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3305
3306                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3307                                         &xmm_mask_lo, &xmm_mask_hi);
3308
3309                 in_over_2x128 (&xmm_src, &xmm_src,
3310                                &xmm_alpha, &xmm_alpha,
3311                                &xmm_mask_lo, &xmm_mask_hi,
3312                                &xmm_dst_lo, &xmm_dst_hi);
3313
3314                 save_128_aligned (
3315                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3316             }
3317
3318             w -= 4;
3319             dst += 4;
3320             mask += 4;
3321         }
3322
3323         while (w)
3324         {
3325             uint8_t m = *mask++;
3326
3327             if (m)
3328             {
3329                 d = *dst;
3330                 mmx_mask = expand_pixel_8_1x128 (m);
3331                 mmx_dest = unpack_32_1x128 (d);
3332
3333                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3334                                                    &mmx_alpha,
3335                                                    &mmx_mask,
3336                                                    &mmx_dest));
3337             }
3338
3339             w--;
3340             dst++;
3341         }
3342     }
3343
3344 }
3345
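/* Solid fill for 8, 16 and 32 bpp images.  The fill value is
 * replicated to 32 bits, then each row uses byte/word/dword
 * stores up to 16-byte alignment, 128 bytes of unrolled SSE2
 * stores in the middle, and narrower stores for the tail.
 */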
3346 pixman_bool_t
3347 pixman_fill_sse2 (uint32_t *bits,
3348                   int       stride,
3349                   int       bpp,
3350                   int       x,
3351                   int       y,
3352                   int       width,
3353                   int       height,
3354                   uint32_t  data)
3355 {
3356     uint32_t byte_width;
3357     uint8_t         *byte_line;
3358
3359     __m128i xmm_def;
3360
3361     if (bpp == 8)
3362     {
3363         uint8_t b;
3364         uint16_t w;
3365
3366         stride = stride * (int) sizeof (uint32_t) / 1;
3367         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3368         byte_width = width;
3369         stride *= 1;
3370
3371         b = data & 0xff;
3372         w = (b << 8) | b;
3373         data = (w << 16) | w;
3374     }
3375     else if (bpp == 16)
3376     {
3377         stride = stride * (int) sizeof (uint32_t) / 2;
3378         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3379         byte_width = 2 * width;
3380         stride *= 2;
3381
3382         data = (data & 0xffff) * 0x00010001;
3383     }
3384     else if (bpp == 32)
3385     {
3386         stride = stride * (int) sizeof (uint32_t) / 4;
3387         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3388         byte_width = 4 * width;
3389         stride *= 4;
3390     }
3391     else
3392     {
3393         return FALSE;
3394     }
3395
3396     xmm_def = create_mask_2x32_128 (data, data);
3397
3398     while (height--)
3399     {
3400         int w;
3401         uint8_t *d = byte_line;
3402         byte_line += stride;
3403         w = byte_width;
3404
3405         while (w >= 1 && ((unsigned long)d & 1))
3406         {
3407             *(uint8_t *)d = data;
3408             w -= 1;
3409             d += 1;
3410         }
3411
3412         while (w >= 2 && ((unsigned long)d & 3))
3413         {
3414             *(uint16_t *)d = data;
3415             w -= 2;
3416             d += 2;
3417         }
3418
3419         while (w >= 4 && ((unsigned long)d & 15))
3420         {
3421             *(uint32_t *)d = data;
3422
3423             w -= 4;
3424             d += 4;
3425         }
3426
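        /* Bulk of the scanline: the unrolled loop issues eight aligned
         * 16-byte stores, i.e. 128 bytes per iteration. */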
3427         while (w >= 128)
3428         {
3429             save_128_aligned ((__m128i*)(d),     xmm_def);
3430             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3431             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3432             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3433             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3434             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3435             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3436             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3437
3438             d += 128;
3439             w -= 128;
3440         }
3441
3442         if (w >= 64)
3443         {
3444             save_128_aligned ((__m128i*)(d),     xmm_def);
3445             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3446             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3447             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3448
3449             d += 64;
3450             w -= 64;
3451         }
3452
3453         if (w >= 32)
3454         {
3455             save_128_aligned ((__m128i*)(d),     xmm_def);
3456             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3457
3458             d += 32;
3459             w -= 32;
3460         }
3461
3462         if (w >= 16)
3463         {
3464             save_128_aligned ((__m128i*)(d),     xmm_def);
3465
3466             d += 16;
3467             w -= 16;
3468         }
3469
3470         while (w >= 4)
3471         {
3472             *(uint32_t *)d = data;
3473
3474             w -= 4;
3475             d += 4;
3476         }
3477
3478         if (w >= 2)
3479         {
3480             *(uint16_t *)d = data;
3481             w -= 2;
3482             d += 2;
3483         }
3484
3485         if (w >= 1)
3486         {
3487             *(uint8_t *)d = data;
3488             w -= 1;
3489             d += 1;
3490         }
3491     }
3492
3493     return TRUE;
3494 }
3495
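/* SRC operator, solid source with an a8 mask, a8r8g8b8 destination:
 * dest = src * mask_alpha.  A zero source degenerates to pixman_fill
 * with 0; in the four-pixel loop an opaque source under a fully set
 * mask word stores the solid color directly, and an all-zero mask word
 * simply stores zeros.
 */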
3496 static void
3497 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3498                              pixman_op_t              op,
3499                              pixman_image_t *         src_image,
3500                              pixman_image_t *         mask_image,
3501                              pixman_image_t *         dst_image,
3502                              int32_t                  src_x,
3503                              int32_t                  src_y,
3504                              int32_t                  mask_x,
3505                              int32_t                  mask_y,
3506                              int32_t                  dest_x,
3507                              int32_t                  dest_y,
3508                              int32_t                  width,
3509                              int32_t                  height)
3510 {
3511     uint32_t src, srca;
3512     uint32_t    *dst_line, *dst;
3513     uint8_t     *mask_line, *mask;
3514     int dst_stride, mask_stride;
3515     int32_t w;
3516     uint32_t m;
3517
3518     __m128i xmm_src, xmm_def;
3519     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3520
3521     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3522
3523     srca = src >> 24;
3524     if (src == 0)
3525     {
3526         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3527                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
3528                           dest_x, dest_y, width, height, 0);
3529         return;
3530     }
3531
3532     PIXMAN_IMAGE_GET_LINE (
3533         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3534     PIXMAN_IMAGE_GET_LINE (
3535         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3536
3537     xmm_def = create_mask_2x32_128 (src, src);
3538     xmm_src = expand_pixel_32_1x128 (src);
3539
3540     while (height--)
3541     {
3542         dst = dst_line;
3543         dst_line += dst_stride;
3544         mask = mask_line;
3545         mask_line += mask_stride;
3546         w = width;
3547
3548         while (w && (unsigned long)dst & 15)
3549         {
3550             uint8_t m = *mask++;
3551
3552             if (m)
3553             {
3554                 *dst = pack_1x128_32 (
3555                     pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3556             }
3557             else
3558             {
3559                 *dst = 0;
3560             }
3561
3562             w--;
3563             dst++;
3564         }
3565
3566         while (w >= 4)
3567         {
3568             m = *((uint32_t*)mask);
3569
3570             if (srca == 0xff && m == 0xffffffff)
3571             {
3572                 save_128_aligned ((__m128i*)dst, xmm_def);
3573             }
3574             else if (m)
3575             {
3576                 xmm_mask = unpack_32_1x128 (m);
3577                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3578
3579
3580                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3581
3582                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3583                                         &xmm_mask_lo, &xmm_mask_hi);
3584
3585                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3586                                     &xmm_mask_lo, &xmm_mask_hi,
3587                                     &xmm_mask_lo, &xmm_mask_hi);
3588
3589                 save_128_aligned (
3590                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3591             }
3592             else
3593             {
3594                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3595             }
3596
3597             w -= 4;
3598             dst += 4;
3599             mask += 4;
3600         }
3601
3602         while (w)
3603         {
3604             uint8_t m = *mask++;
3605
3606             if (m)
3607             {
3608                 *dst = pack_1x128_32 (
3609                     pix_multiply_1x128 (
3610                         xmm_src, expand_pixel_8_1x128 (m)));
3611             }
3612             else
3613             {
3614                 *dst = 0;
3615             }
3616
3617             w--;
3618             dst++;
3619         }
3620     }
3621
3622 }
3623
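/* OVER operator, solid source with an a8 mask, r5g6b5 destination.
 * Groups of eight 565 pixels are unpacked into four 8888 register
 * halves, combined with in_over (src IN mask, OVER dest) and repacked;
 * unaligned heads and tails take the single-pixel path.
 */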
3624 static void
3625 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3626                               pixman_op_t              op,
3627                               pixman_image_t *         src_image,
3628                               pixman_image_t *         mask_image,
3629                               pixman_image_t *         dst_image,
3630                               int32_t                  src_x,
3631                               int32_t                  src_y,
3632                               int32_t                  mask_x,
3633                               int32_t                  mask_y,
3634                               int32_t                  dest_x,
3635                               int32_t                  dest_y,
3636                               int32_t                  width,
3637                               int32_t                  height)
3638 {
3639     uint32_t src, srca;
3640     uint16_t    *dst_line, *dst, d;
3641     uint8_t     *mask_line, *mask;
3642     int dst_stride, mask_stride;
3643     int32_t w;
3644     uint32_t m;
3645     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3646
3647     __m128i xmm_src, xmm_alpha;
3648     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3649     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3650
3651     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3652
3653     srca = src >> 24;
3654     if (src == 0)
3655         return;
3656
3657     PIXMAN_IMAGE_GET_LINE (
3658         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3659     PIXMAN_IMAGE_GET_LINE (
3660         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3661
3662     xmm_src = expand_pixel_32_1x128 (src);
3663     xmm_alpha = expand_alpha_1x128 (xmm_src);
3664     mmx_src = xmm_src;
3665     mmx_alpha = xmm_alpha;
3666
3667     while (height--)
3668     {
3669         dst = dst_line;
3670         dst_line += dst_stride;
3671         mask = mask_line;
3672         mask_line += mask_stride;
3673         w = width;
3674
3675         while (w && (unsigned long)dst & 15)
3676         {
3677             m = *mask++;
3678
3679             if (m)
3680             {
3681                 d = *dst;
3682                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3683                 mmx_dest = expand565_16_1x128 (d);
3684
3685                 *dst = pack_565_32_16 (
3686                     pack_1x128_32 (
3687                         in_over_1x128 (
3688                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3689             }
3690
3691             w--;
3692             dst++;
3693         }
3694
3695         while (w >= 8)
3696         {
3697             xmm_dst = load_128_aligned ((__m128i*) dst);
3698             unpack_565_128_4x128 (xmm_dst,
3699                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3700
3701             m = *((uint32_t*)mask);
3702             mask += 4;
3703
3704             if (m)
3705             {
3706                 xmm_mask = unpack_32_1x128 (m);
3707                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3708
3709
3710                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3711
3712                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3713                                         &xmm_mask_lo, &xmm_mask_hi);
3714
3715                 in_over_2x128 (&xmm_src, &xmm_src,
3716                                &xmm_alpha, &xmm_alpha,
3717                                &xmm_mask_lo, &xmm_mask_hi,
3718                                &xmm_dst0, &xmm_dst1);
3719             }
3720
3721             m = *((uint32_t*)mask);
3722             mask += 4;
3723
3724             if (m)
3725             {
3726                 xmm_mask = unpack_32_1x128 (m);
3727                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3728
3729
3730                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3731
3732                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3733                                         &xmm_mask_lo, &xmm_mask_hi);
3734                 in_over_2x128 (&xmm_src, &xmm_src,
3735                                &xmm_alpha, &xmm_alpha,
3736                                &xmm_mask_lo, &xmm_mask_hi,
3737                                &xmm_dst2, &xmm_dst3);
3738             }
3739
3740             save_128_aligned (
3741                 (__m128i*)dst, pack_565_4x128_128 (
3742                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3743
3744             w -= 8;
3745             dst += 8;
3746         }
3747
3748         while (w)
3749         {
3750             m = *mask++;
3751
3752             if (m)
3753             {
3754                 d = *dst;
3755                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3756                 mmx_dest = expand565_16_1x128 (d);
3757
3758                 *dst = pack_565_32_16 (
3759                     pack_1x128_32 (
3760                         in_over_1x128 (
3761                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3762             }
3763
3764             w--;
3765             dst++;
3766         }
3767     }
3768
3769 }
3770
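/* OVER operator from a "pixbuf" source -- judging by the helper names,
 * non-premultiplied pixels with reversed channel order -- onto an
 * r5g6b5 destination.  Fully opaque source words only need their
 * channels swapped (invert_colors_2x128), fully transparent ones leave
 * the destination alone, and everything else goes through
 * over_rev_non_pre_2x128, which premultiplies while compositing.
 */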
3771 static void
3772 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3773                                  pixman_op_t              op,
3774                                  pixman_image_t *         src_image,
3775                                  pixman_image_t *         mask_image,
3776                                  pixman_image_t *         dst_image,
3777                                  int32_t                  src_x,
3778                                  int32_t                  src_y,
3779                                  int32_t                  mask_x,
3780                                  int32_t                  mask_y,
3781                                  int32_t                  dest_x,
3782                                  int32_t                  dest_y,
3783                                  int32_t                  width,
3784                                  int32_t                  height)
3785 {
3786     uint16_t    *dst_line, *dst, d;
3787     uint32_t    *src_line, *src, s;
3788     int dst_stride, src_stride;
3789     int32_t w;
3790     uint32_t opaque, zero;
3791
3792     __m128i ms;
3793     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3794     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3795
3796     PIXMAN_IMAGE_GET_LINE (
3797         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3798     PIXMAN_IMAGE_GET_LINE (
3799         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3800
3801     while (height--)
3802     {
3803         dst = dst_line;
3804         dst_line += dst_stride;
3805         src = src_line;
3806         src_line += src_stride;
3807         w = width;
3808
3809         while (w && (unsigned long)dst & 15)
3810         {
3811             s = *src++;
3812             d = *dst;
3813
3814             ms = unpack_32_1x128 (s);
3815
3816             *dst++ = pack_565_32_16 (
3817                 pack_1x128_32 (
3818                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3819             w--;
3820         }
3821
3822         while (w >= 8)
3823         {
3824             /* First round */
3825             xmm_src = load_128_unaligned ((__m128i*)src);
3826             xmm_dst = load_128_aligned  ((__m128i*)dst);
3827
3828             opaque = is_opaque (xmm_src);
3829             zero = is_zero (xmm_src);
3830
3831             unpack_565_128_4x128 (xmm_dst,
3832                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3833             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3834
3835             /* preload next round */
3836             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3837
3838             if (opaque)
3839             {
3840                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3841                                      &xmm_dst0, &xmm_dst1);
3842             }
3843             else if (!zero)
3844             {
3845                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3846                                         &xmm_dst0, &xmm_dst1);
3847             }
3848
3849             /* Second round */
3850             opaque = is_opaque (xmm_src);
3851             zero = is_zero (xmm_src);
3852
3853             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3854
3855             if (opaque)
3856             {
3857                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3858                                      &xmm_dst2, &xmm_dst3);
3859             }
3860             else if (!zero)
3861             {
3862                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3863                                         &xmm_dst2, &xmm_dst3);
3864             }
3865
3866             save_128_aligned (
3867                 (__m128i*)dst, pack_565_4x128_128 (
3868                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3869
3870             w -= 8;
3871             src += 8;
3872             dst += 8;
3873         }
3874
3875         while (w)
3876         {
3877             s = *src++;
3878             d = *dst;
3879
3880             ms = unpack_32_1x128 (s);
3881
3882             *dst++ = pack_565_32_16 (
3883                 pack_1x128_32 (
3884                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3885             w--;
3886         }
3887     }
3888
3889 }
3890
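/* The same pixbuf OVER as above, but with an a8r8g8b8 destination:
 * opaque source words are channel-swapped and stored without reading
 * the destination, all-zero words are skipped, and the rest use
 * over_rev_non_pre_2x128 against the loaded destination.
 */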
3891 static void
3892 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3893                                  pixman_op_t              op,
3894                                  pixman_image_t *         src_image,
3895                                  pixman_image_t *         mask_image,
3896                                  pixman_image_t *         dst_image,
3897                                  int32_t                  src_x,
3898                                  int32_t                  src_y,
3899                                  int32_t                  mask_x,
3900                                  int32_t                  mask_y,
3901                                  int32_t                  dest_x,
3902                                  int32_t                  dest_y,
3903                                  int32_t                  width,
3904                                  int32_t                  height)
3905 {
3906     uint32_t    *dst_line, *dst, d;
3907     uint32_t    *src_line, *src, s;
3908     int dst_stride, src_stride;
3909     int32_t w;
3910     uint32_t opaque, zero;
3911
3912     __m128i xmm_src_lo, xmm_src_hi;
3913     __m128i xmm_dst_lo, xmm_dst_hi;
3914
3915     PIXMAN_IMAGE_GET_LINE (
3916         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3917     PIXMAN_IMAGE_GET_LINE (
3918         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3919
3920     while (height--)
3921     {
3922         dst = dst_line;
3923         dst_line += dst_stride;
3924         src = src_line;
3925         src_line += src_stride;
3926         w = width;
3927
3928         while (w && (unsigned long)dst & 15)
3929         {
3930             s = *src++;
3931             d = *dst;
3932
3933             *dst++ = pack_1x128_32 (
3934                 over_rev_non_pre_1x128 (
3935                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
3936
3937             w--;
3938         }
3939
3940         while (w >= 4)
3941         {
3942             xmm_src_hi = load_128_unaligned ((__m128i*)src);
3943
3944             opaque = is_opaque (xmm_src_hi);
3945             zero = is_zero (xmm_src_hi);
3946
3947             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3948
3949             if (opaque)
3950             {
3951                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3952                                      &xmm_dst_lo, &xmm_dst_hi);
3953
3954                 save_128_aligned (
3955                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3956             }
3957             else if (!zero)
3958             {
3959                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
3960
3961                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3962
3963                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3964                                         &xmm_dst_lo, &xmm_dst_hi);
3965
3966                 save_128_aligned (
3967                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3968             }
3969
3970             w -= 4;
3971             dst += 4;
3972             src += 4;
3973         }
3974
3975         while (w)
3976         {
3977             s = *src++;
3978             d = *dst;
3979
3980             *dst++ = pack_1x128_32 (
3981                 over_rev_non_pre_1x128 (
3982                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
3983
3984             w--;
3985         }
3986     }
3987
3988 }
3989
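/* OVER operator, solid source with a component-alpha (8888) mask,
 * r5g6b5 destination.  pack_cmp is the byte mask from comparing four
 * mask pixels against zero: a value of 0xffff means all four are zero,
 * so the in_over for that half of the group can be skipped.
 */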
3990 static void
3991 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3992                                     pixman_op_t              op,
3993                                     pixman_image_t *         src_image,
3994                                     pixman_image_t *         mask_image,
3995                                     pixman_image_t *         dst_image,
3996                                     int32_t                  src_x,
3997                                     int32_t                  src_y,
3998                                     int32_t                  mask_x,
3999                                     int32_t                  mask_y,
4000                                     int32_t                  dest_x,
4001                                     int32_t                  dest_y,
4002                                     int32_t                  width,
4003                                     int32_t                  height)
4004 {
4005     uint32_t src;
4006     uint16_t    *dst_line, *dst, d;
4007     uint32_t    *mask_line, *mask, m;
4008     int dst_stride, mask_stride;
4009     int w;
4010     uint32_t pack_cmp;
4011
4012     __m128i xmm_src, xmm_alpha;
4013     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4014     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4015
4016     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4017
4018     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4019
4020     if (src == 0)
4021         return;
4022
4023     PIXMAN_IMAGE_GET_LINE (
4024         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4025     PIXMAN_IMAGE_GET_LINE (
4026         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4027
4028     xmm_src = expand_pixel_32_1x128 (src);
4029     xmm_alpha = expand_alpha_1x128 (xmm_src);
4030     mmx_src = xmm_src;
4031     mmx_alpha = xmm_alpha;
4032
4033     while (height--)
4034     {
4035         w = width;
4036         mask = mask_line;
4037         dst = dst_line;
4038         mask_line += mask_stride;
4039         dst_line += dst_stride;
4040
4041         while (w && ((unsigned long)dst & 15))
4042         {
4043             m = *(uint32_t *) mask;
4044
4045             if (m)
4046             {
4047                 d = *dst;
4048                 mmx_mask = unpack_32_1x128 (m);
4049                 mmx_dest = expand565_16_1x128 (d);
4050
4051                 *dst = pack_565_32_16 (
4052                     pack_1x128_32 (
4053                         in_over_1x128 (
4054                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4055             }
4056
4057             w--;
4058             dst++;
4059             mask++;
4060         }
4061
4062         while (w >= 8)
4063         {
4064             /* First round */
4065             xmm_mask = load_128_unaligned ((__m128i*)mask);
4066             xmm_dst = load_128_aligned ((__m128i*)dst);
4067
4068             pack_cmp = _mm_movemask_epi8 (
4069                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4070
4071             unpack_565_128_4x128 (xmm_dst,
4072                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4073             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4074
4075             /* preload next round */
4076             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4077
4078             /* pack_cmp == 0xffff means all four mask pixels are zero */
4079             if (pack_cmp != 0xffff)
4080             {
4081                 in_over_2x128 (&xmm_src, &xmm_src,
4082                                &xmm_alpha, &xmm_alpha,
4083                                &xmm_mask_lo, &xmm_mask_hi,
4084                                &xmm_dst0, &xmm_dst1);
4085             }
4086
4087             /* Second round */
4088             pack_cmp = _mm_movemask_epi8 (
4089                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4090
4091             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4092
4093             if (pack_cmp != 0xffff)
4094             {
4095                 in_over_2x128 (&xmm_src, &xmm_src,
4096                                &xmm_alpha, &xmm_alpha,
4097                                &xmm_mask_lo, &xmm_mask_hi,
4098                                &xmm_dst2, &xmm_dst3);
4099             }
4100
4101             save_128_aligned (
4102                 (__m128i*)dst, pack_565_4x128_128 (
4103                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4104
4105             w -= 8;
4106             dst += 8;
4107             mask += 8;
4108         }
4109
4110         while (w)
4111         {
4112             m = *(uint32_t *) mask;
4113
4114             if (m)
4115             {
4116                 d = *dst;
4117                 mmx_mask = unpack_32_1x128 (m);
4118                 mmx_dest = expand565_16_1x128 (d);
4119
4120                 *dst = pack_565_32_16 (
4121                     pack_1x128_32 (
4122                         in_over_1x128 (
4123                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4124             }
4125
4126             w--;
4127             dst++;
4128             mask++;
4129         }
4130     }
4131
4132 }
4133
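/* IN operator, solid source with an a8 mask, a8 destination:
 * dest = src_alpha * mask * dest, sixteen pixels per SSE2 iteration.
 */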
4134 static void
4135 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4136                          pixman_op_t              op,
4137                          pixman_image_t *         src_image,
4138                          pixman_image_t *         mask_image,
4139                          pixman_image_t *         dst_image,
4140                          int32_t                  src_x,
4141                          int32_t                  src_y,
4142                          int32_t                  mask_x,
4143                          int32_t                  mask_y,
4144                          int32_t                  dest_x,
4145                          int32_t                  dest_y,
4146                          int32_t                  width,
4147                          int32_t                  height)
4148 {
4149     uint8_t     *dst_line, *dst;
4150     uint8_t     *mask_line, *mask;
4151     int dst_stride, mask_stride;
4152     uint32_t d, m;
4153     uint32_t src;
4154     uint8_t sa;
4155     int32_t w;
4156
4157     __m128i xmm_alpha;
4158     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4159     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4160
4161     PIXMAN_IMAGE_GET_LINE (
4162         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4163     PIXMAN_IMAGE_GET_LINE (
4164         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4165
4166     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4167
4168     sa = src >> 24;
4169
4170     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4171
4172     while (height--)
4173     {
4174         dst = dst_line;
4175         dst_line += dst_stride;
4176         mask = mask_line;
4177         mask_line += mask_stride;
4178         w = width;
4179
4180         while (w && ((unsigned long)dst & 15))
4181         {
4182             m = (uint32_t) *mask++;
4183             d = (uint32_t) *dst;
4184
4185             *dst++ = (uint8_t) pack_1x128_32 (
4186                 pix_multiply_1x128 (
4187                     pix_multiply_1x128 (xmm_alpha,
4188                                        unpack_32_1x128 (m)),
4189                     unpack_32_1x128 (d)));
4190             w--;
4191         }
4192
4193         while (w >= 16)
4194         {
4195             xmm_mask = load_128_unaligned ((__m128i*)mask);
4196             xmm_dst = load_128_aligned ((__m128i*)dst);
4197
4198             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4199             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4200
4201             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4202                                 &xmm_mask_lo, &xmm_mask_hi,
4203                                 &xmm_mask_lo, &xmm_mask_hi);
4204
4205             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4206                                 &xmm_dst_lo, &xmm_dst_hi,
4207                                 &xmm_dst_lo, &xmm_dst_hi);
4208
4209             save_128_aligned (
4210                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4211
4212             mask += 16;
4213             dst += 16;
4214             w -= 16;
4215         }
4216
4217         while (w)
4218         {
4219             m = (uint32_t) *mask++;
4220             d = (uint32_t) *dst;
4221
4222             *dst++ = (uint8_t) pack_1x128_32 (
4223                 pix_multiply_1x128 (
4224                     pix_multiply_1x128 (
4225                         xmm_alpha, unpack_32_1x128 (m)),
4226                     unpack_32_1x128 (d)));
4227             w--;
4228         }
4229     }
4230
4231 }
4232
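/* IN operator, solid source, a8 destination, no mask:
 * dest = src_alpha * dest.  An alpha of 0xff leaves the destination
 * unchanged and an alpha of 0 degenerates to a fill with zero.
 */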
4233 static void
4234 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4235                        pixman_op_t              op,
4236                        pixman_image_t *         src_image,
4237                        pixman_image_t *         mask_image,
4238                        pixman_image_t *         dst_image,
4239                        int32_t                  src_x,
4240                        int32_t                  src_y,
4241                        int32_t                  mask_x,
4242                        int32_t                  mask_y,
4243                        int32_t                  dest_x,
4244                        int32_t                  dest_y,
4245                        int32_t                  width,
4246                        int32_t                  height)
4247 {
4248     uint8_t     *dst_line, *dst;
4249     int dst_stride;
4250     uint32_t d;
4251     uint32_t src;
4252     int32_t w;
4253
4254     __m128i xmm_alpha;
4255     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4256
4257     PIXMAN_IMAGE_GET_LINE (
4258         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4259
4260     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4261
4262     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4263
4264     src = src >> 24;
4265
4266     if (src == 0xff)
4267         return;
4268
4269     if (src == 0x00)
4270     {
4271         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4272                      8, dest_x, dest_y, width, height, src);
4273
4274         return;
4275     }
4276
4277     while (height--)
4278     {
4279         dst = dst_line;
4280         dst_line += dst_stride;
4281         w = width;
4282
4283         while (w && ((unsigned long)dst & 15))
4284         {
4285             d = (uint32_t) *dst;
4286
4287             *dst++ = (uint8_t) pack_1x128_32 (
4288                 pix_multiply_1x128 (
4289                     xmm_alpha,
4290                     unpack_32_1x128 (d)));
4291             w--;
4292         }
4293
4294         while (w >= 16)
4295         {
4296             xmm_dst = load_128_aligned ((__m128i*)dst);
4297
4298             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4299
4300             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4301                                 &xmm_dst_lo, &xmm_dst_hi,
4302                                 &xmm_dst_lo, &xmm_dst_hi);
4303
4304             save_128_aligned (
4305                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4306
4307             dst += 16;
4308             w -= 16;
4309         }
4310
4311         while (w)
4312         {
4313             d = (uint32_t) *dst;
4314
4315             *dst++ = (uint8_t) pack_1x128_32 (
4316                 pix_multiply_1x128 (
4317                     xmm_alpha,
4318                     unpack_32_1x128 (d)));
4319             w--;
4320         }
4321     }
4322
4323 }
4324
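/* IN operator, a8 source and destination: dest = src * dest, with
 * rounding and renormalization handled inside pix_multiply.
 */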
4325 static void
4326 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4327                        pixman_op_t              op,
4328                        pixman_image_t *         src_image,
4329                        pixman_image_t *         mask_image,
4330                        pixman_image_t *         dst_image,
4331                        int32_t                  src_x,
4332                        int32_t                  src_y,
4333                        int32_t                  mask_x,
4334                        int32_t                  mask_y,
4335                        int32_t                  dest_x,
4336                        int32_t                  dest_y,
4337                        int32_t                  width,
4338                        int32_t                  height)
4339 {
4340     uint8_t     *dst_line, *dst;
4341     uint8_t     *src_line, *src;
4342     int src_stride, dst_stride;
4343     int32_t w;
4344     uint32_t s, d;
4345
4346     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4347     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4348
4349     PIXMAN_IMAGE_GET_LINE (
4350         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4351     PIXMAN_IMAGE_GET_LINE (
4352         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4353
4354     while (height--)
4355     {
4356         dst = dst_line;
4357         dst_line += dst_stride;
4358         src = src_line;
4359         src_line += src_stride;
4360         w = width;
4361
4362         while (w && ((unsigned long)dst & 15))
4363         {
4364             s = (uint32_t) *src++;
4365             d = (uint32_t) *dst;
4366
4367             *dst++ = (uint8_t) pack_1x128_32 (
4368                 pix_multiply_1x128 (
4369                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4370             w--;
4371         }
4372
4373         while (w >= 16)
4374         {
4375             xmm_src = load_128_unaligned ((__m128i*)src);
4376             xmm_dst = load_128_aligned ((__m128i*)dst);
4377
4378             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4379             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4380
4381             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4382                                 &xmm_dst_lo, &xmm_dst_hi,
4383                                 &xmm_dst_lo, &xmm_dst_hi);
4384
4385             save_128_aligned (
4386                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4387
4388             src += 16;
4389             dst += 16;
4390             w -= 16;
4391         }
4392
4393         while (w)
4394         {
4395             s = (uint32_t) *src++;
4396             d = (uint32_t) *dst;
4397
4398             *dst++ = (uint8_t) pack_1x128_32 (
4399                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4400             w--;
4401         }
4402     }
4403
4404 }
4405
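/* ADD operator, solid source with an a8 mask, a8 destination:
 * dest = clamp (src_alpha * mask + dest), using the saturating
 * _mm_adds_epu16 on unpacked 16-bit channels.
 */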
4406 static void
4407 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4408                           pixman_op_t              op,
4409                           pixman_image_t *         src_image,
4410                           pixman_image_t *         mask_image,
4411                           pixman_image_t *         dst_image,
4412                           int32_t                  src_x,
4413                           int32_t                  src_y,
4414                           int32_t                  mask_x,
4415                           int32_t                  mask_y,
4416                           int32_t                  dest_x,
4417                           int32_t                  dest_y,
4418                           int32_t                  width,
4419                           int32_t                  height)
4420 {
4421     uint8_t     *dst_line, *dst;
4422     uint8_t     *mask_line, *mask;
4423     int dst_stride, mask_stride;
4424     int32_t w;
4425     uint32_t src;
4426     uint8_t sa;
4427     uint32_t m, d;
4428
4429     __m128i xmm_alpha;
4430     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4431     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4432
4433     PIXMAN_IMAGE_GET_LINE (
4434         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4435     PIXMAN_IMAGE_GET_LINE (
4436         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4437
4438     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4439
4440     sa = src >> 24;
4441
4442     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4443
4444     while (height--)
4445     {
4446         dst = dst_line;
4447         dst_line += dst_stride;
4448         mask = mask_line;
4449         mask_line += mask_stride;
4450         w = width;
4451
4452         while (w && ((unsigned long)dst & 15))
4453         {
4454             m = (uint32_t) *mask++;
4455             d = (uint32_t) *dst;
4456
4457             *dst++ = (uint8_t) pack_1x128_32 (
4458                 _mm_adds_epu16 (
4459                     pix_multiply_1x128 (
4460                         xmm_alpha, unpack_32_1x128 (m)),
4461                     unpack_32_1x128 (d)));
4462             w--;
4463         }
4464
4465         while (w >= 16)
4466         {
4467             xmm_mask = load_128_unaligned ((__m128i*)mask);
4468             xmm_dst = load_128_aligned ((__m128i*)dst);
4469
4470             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4471             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4472
4473             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4474                                 &xmm_mask_lo, &xmm_mask_hi,
4475                                 &xmm_mask_lo, &xmm_mask_hi);
4476
4477             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4478             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4479
4480             save_128_aligned (
4481                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4482
4483             mask += 16;
4484             dst += 16;
4485             w -= 16;
4486         }
4487
4488         while (w)
4489         {
4490             m = (uint32_t) *mask++;
4491             d = (uint32_t) *dst;
4492
4493             *dst++ = (uint8_t) pack_1x128_32 (
4494                 _mm_adds_epu16 (
4495                     pix_multiply_1x128 (
4496                         xmm_alpha, unpack_32_1x128 (m)),
4497                     unpack_32_1x128 (d)));
4498
4499             w--;
4500         }
4501     }
4502
4503 }
4504
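/* ADD operator, solid source, a8 destination, no mask.  The source
 * alpha is replicated into every byte of an XMM register, so sixteen
 * pixels at a time reduce to one saturating _mm_adds_epu8.  Alpha 0 is
 * a no-op and alpha 0xff is a plain fill with 0xff.
 */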
4505 static void
4506 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4507                         pixman_op_t              op,
4508                         pixman_image_t *         src_image,
4509                         pixman_image_t *         mask_image,
4510                         pixman_image_t *         dst_image,
4511                         int32_t                  src_x,
4512                         int32_t                  src_y,
4513                         int32_t                  mask_x,
4514                         int32_t                  mask_y,
4515                         int32_t                  dest_x,
4516                         int32_t                  dest_y,
4517                         int32_t                  width,
4518                         int32_t                  height)
4519 {
4520     uint8_t     *dst_line, *dst;
4521     int dst_stride;
4522     int32_t w;
4523     uint32_t src;
4524
4525     __m128i xmm_src;
4526
4527     PIXMAN_IMAGE_GET_LINE (
4528         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4529
4530     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4531
4532     src >>= 24;
4533
4534     if (src == 0x00)
4535         return;
4536
4537     if (src == 0xff)
4538     {
4539         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4540                      8, dest_x, dest_y, width, height, 0xff);
4541
4542         return;
4543     }
4544
4545     src = (src << 24) | (src << 16) | (src << 8) | src;
4546     xmm_src = _mm_set_epi32 (src, src, src, src);
4547
4548     while (height--)
4549     {
4550         dst = dst_line;
4551         dst_line += dst_stride;
4552         w = width;
4553
4554         while (w && ((unsigned long)dst & 15))
4555         {
4556             *dst = (uint8_t)_mm_cvtsi128_si32 (
4557                 _mm_adds_epu8 (
4558                     xmm_src,
4559                     _mm_cvtsi32_si128 (*dst)));
4560
4561             w--;
4562             dst++;
4563         }
4564
4565         while (w >= 16)
4566         {
4567             save_128_aligned (
4568                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4569
4570             dst += 16;
4571             w -= 16;
4572         }
4573
4574         while (w)
4575         {
4576             *dst = (uint8_t)_mm_cvtsi128_si32 (
4577                 _mm_adds_epu8 (
4578                     xmm_src,
4579                     _mm_cvtsi32_si128 (*dst)));
4580
4581             w--;
4582             dst++;
4583         }
4584     }
4585
4586 }
4587
4588 static void
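/* ADD operator, a8 source and destination.  Unaligned head and tail
 * bytes use a scalar saturating-add trick (commented below); the
 * aligned middle is handed to sse2_combine_add_u, which treats every
 * uint32_t as four a8 pixels.
 */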
4589 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4590                         pixman_op_t              op,
4591                         pixman_image_t *         src_image,
4592                         pixman_image_t *         mask_image,
4593                         pixman_image_t *         dst_image,
4594                         int32_t                  src_x,
4595                         int32_t                  src_y,
4596                         int32_t                  mask_x,
4597                         int32_t                  mask_y,
4598                         int32_t                  dest_x,
4599                         int32_t                  dest_y,
4600                         int32_t                  width,
4601                         int32_t                  height)
4602 {
4603     uint8_t     *dst_line, *dst;
4604     uint8_t     *src_line, *src;
4605     int dst_stride, src_stride;
4606     int32_t w;
4607     uint16_t t;
4608
4609     PIXMAN_IMAGE_GET_LINE (
4610         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4611     PIXMAN_IMAGE_GET_LINE (
4612         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4613
4614     while (height--)
4615     {
4616         dst = dst_line;
4617         src = src_line;
4618
4619         dst_line += dst_stride;
4620         src_line += src_stride;
4621         w = width;
4622
4623         /* Small head */
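        /* t holds the 9-bit sum; (0 - (t >> 8)) is all ones exactly
         * when the sum overflowed 8 bits, so the OR clamps to 0xff. */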
4624         while (w && (unsigned long)dst & 3)
4625         {
4626             t = (*dst) + (*src++);
4627             *dst++ = t | (0 - (t >> 8));
4628             w--;
4629         }
4630
4631         sse2_combine_add_u (imp, op,
4632                             (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4633
4634         /* Small tail */
4635         dst += w & ~3;
4636         src += w & ~3;
4637
4638         w &= 3;
4639
4640         while (w)
4641         {
4642             t = (*dst) + (*src++);
4643             *dst++ = t | (0 - (t >> 8));
4644             w--;
4645         }
4646     }
4647
4648 }
4649
4650 static void
4651 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4652                               pixman_op_t              op,
4653                               pixman_image_t *         src_image,
4654                               pixman_image_t *         mask_image,
4655                               pixman_image_t *         dst_image,
4656                               int32_t                  src_x,
4657                               int32_t                  src_y,
4658                               int32_t                  mask_x,
4659                               int32_t                  mask_y,
4660                               int32_t                  dest_x,
4661                               int32_t                  dest_y,
4662                               int32_t                  width,
4663                               int32_t                  height)
4664 {
4665     uint32_t    *dst_line, *dst;
4666     uint32_t    *src_line, *src;
4667     int dst_stride, src_stride;
4668
4669     PIXMAN_IMAGE_GET_LINE (
4670         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4671     PIXMAN_IMAGE_GET_LINE (
4672         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4673
4674     while (height--)
4675     {
4676         dst = dst_line;
4677         dst_line += dst_stride;
4678         src = src_line;
4679         src_line += src_stride;
4680
4681         sse2_combine_add_u (imp, op, dst, src, NULL, width);
4682     }
4683
4684 }
4685
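/* Plain rectangle copy for equal source and destination bpp (16 or 32
 * only).  Each row is copied with small stores until the destination
 * is 16-byte aligned, then 64 bytes per iteration using unaligned
 * loads and aligned stores, then a scalar tail.
 */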
4686 static pixman_bool_t
4687 pixman_blt_sse2 (uint32_t *src_bits,
4688                  uint32_t *dst_bits,
4689                  int       src_stride,
4690                  int       dst_stride,
4691                  int       src_bpp,
4692                  int       dst_bpp,
4693                  int       src_x,
4694                  int       src_y,
4695                  int       dst_x,
4696                  int       dst_y,
4697                  int       width,
4698                  int       height)
4699 {
4700     uint8_t *   src_bytes;
4701     uint8_t *   dst_bytes;
4702     int byte_width;
4703
4704     if (src_bpp != dst_bpp)
4705         return FALSE;
4706
4707     if (src_bpp == 16)
4708     {
4709         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4710         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4711         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4712         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4713         byte_width = 2 * width;
4714         src_stride *= 2;
4715         dst_stride *= 2;
4716     }
4717     else if (src_bpp == 32)
4718     {
4719         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4720         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4721         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4722         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4723         byte_width = 4 * width;
4724         src_stride *= 4;
4725         dst_stride *= 4;
4726     }
4727     else
4728     {
4729         return FALSE;
4730     }
4731
4732     while (height--)
4733     {
4734         int w;
4735         uint8_t *s = src_bytes;
4736         uint8_t *d = dst_bytes;
4737         src_bytes += src_stride;
4738         dst_bytes += dst_stride;
4739         w = byte_width;
4740
4741         while (w >= 2 && ((unsigned long)d & 3))
4742         {
4743             *(uint16_t *)d = *(uint16_t *)s;
4744             w -= 2;
4745             s += 2;
4746             d += 2;
4747         }
4748
4749         while (w >= 4 && ((unsigned long)d & 15))
4750         {
4751             *(uint32_t *)d = *(uint32_t *)s;
4752
4753             w -= 4;
4754             s += 4;
4755             d += 4;
4756         }
4757
4758         while (w >= 64)
4759         {
4760             __m128i xmm0, xmm1, xmm2, xmm3;
4761
4762             xmm0 = load_128_unaligned ((__m128i*)(s));
4763             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4764             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4765             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4766
4767             save_128_aligned ((__m128i*)(d),    xmm0);
4768             save_128_aligned ((__m128i*)(d + 16), xmm1);
4769             save_128_aligned ((__m128i*)(d + 32), xmm2);
4770             save_128_aligned ((__m128i*)(d + 48), xmm3);
4771
4772             s += 64;
4773             d += 64;
4774             w -= 64;
4775         }
4776
4777         while (w >= 16)
4778         {
4779             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4780
4781             w -= 16;
4782             d += 16;
4783             s += 16;
4784         }
4785
4786         while (w >= 4)
4787         {
4788             *(uint32_t *)d = *(uint32_t *)s;
4789
4790             w -= 4;
4791             s += 4;
4792             d += 4;
4793         }
4794
4795         if (w >= 2)
4796         {
4797             *(uint16_t *)d = *(uint16_t *)s;
4798             w -= 2;
4799             s += 2;
4800             d += 2;
4801         }
4802     }
4803
4804
4805     return TRUE;
4806 }
4807
4808 static void
4809 sse2_composite_copy_area (pixman_implementation_t *imp,
4810                           pixman_op_t              op,
4811                           pixman_image_t *         src_image,
4812                           pixman_image_t *         mask_image,
4813                           pixman_image_t *         dst_image,
4814                           int32_t                  src_x,
4815                           int32_t                  src_y,
4816                           int32_t                  mask_x,
4817                           int32_t                  mask_y,
4818                           int32_t                  dest_x,
4819                           int32_t                  dest_y,
4820                           int32_t                  width,
4821                           int32_t                  height)
4822 {
4823     pixman_blt_sse2 (src_image->bits.bits,
4824                      dst_image->bits.bits,
4825                      src_image->bits.rowstride,
4826                      dst_image->bits.rowstride,
4827                      PIXMAN_FORMAT_BPP (src_image->bits.format),
4828                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
4829                      src_x, src_y, dest_x, dest_y, width, height);
4830 }
4831
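/* OVER operator, x8r8g8b8 source with an a8 mask, a8r8g8b8 destination.
 * The source is forced opaque by OR-ing in 0xff000000, so a 0xff mask
 * (0xffffffff for a four-pixel word) reduces to a straight copy; the
 * general case is in_over against the constant opaque alpha mask_00ff.
 */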
4832 static void
4833 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4834                                  pixman_op_t              op,
4835                                  pixman_image_t *         src_image,
4836                                  pixman_image_t *         mask_image,
4837                                  pixman_image_t *         dst_image,
4838                                  int32_t                  src_x,
4839                                  int32_t                  src_y,
4840                                  int32_t                  mask_x,
4841                                  int32_t                  mask_y,
4842                                  int32_t                  dest_x,
4843                                  int32_t                  dest_y,
4844                                  int32_t                  width,
4845                                  int32_t                  height)
4846 {
4847     uint32_t    *src, *src_line, s;
4848     uint32_t    *dst, *dst_line, d;
4849     uint8_t         *mask, *mask_line;
4850     uint32_t m;
4851     int src_stride, mask_stride, dst_stride;
4852     int32_t w;
4853     __m128i ms;
4854
4855     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4856     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4857     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4858
4859     PIXMAN_IMAGE_GET_LINE (
4860         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4861     PIXMAN_IMAGE_GET_LINE (
4862         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4863     PIXMAN_IMAGE_GET_LINE (
4864         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4865
4866     while (height--)
4867     {
4868         src = src_line;
4869         src_line += src_stride;
4870         dst = dst_line;
4871         dst_line += dst_stride;
4872         mask = mask_line;
4873         mask_line += mask_stride;
4874
4875         w = width;
4876
4877         while (w && (unsigned long)dst & 15)
4878         {
4879             s = 0xff000000 | *src++;
4880             m = (uint32_t) *mask++;
4881             d = *dst;
4882             ms = unpack_32_1x128 (s);
4883
4884             if (m != 0xff)
4885             {
4886                 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4887                 __m128i md = unpack_32_1x128 (d);
4888
4889                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4890             }
4891
4892             *dst++ = pack_1x128_32 (ms);
4893             w--;
4894         }
4895
4896         while (w >= 4)
4897         {
4898             m = *(uint32_t*) mask;
4899             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
4900
4901             if (m == 0xffffffff)
4902             {
4903                 save_128_aligned ((__m128i*)dst, xmm_src);
4904             }
4905             else
4906             {
4907                 xmm_dst = load_128_aligned ((__m128i*)dst);
4908
4909                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4910
4911                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4912                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4913                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4914
4915                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4916
4917                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
4918
4919                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4920             }
4921
4922             src += 4;
4923             dst += 4;
4924             mask += 4;
4925             w -= 4;
4926         }
4927
4928         while (w)
4929         {
4930             m = (uint32_t) *mask++;
4931
4932             if (m)
4933             {
4934                 s = 0xff000000 | *src;
4935
4936                 if (m == 0xff)
4937                 {
4938                     *dst = s;
4939                 }
4940                 else
4941                 {
4942                     __m128i ma, md, ms;
4943
4944                     d = *dst;
4945
4946                     ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4947                     md = unpack_32_1x128 (d);
4948                     ms = unpack_32_1x128 (s);
4949
4950                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4951                 }
4952
4953             }
4954
4955             src++;
4956             dst++;
4957             w--;
4958         }
4959     }
4960
4961 }
4962
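/* OVER operator, a8r8g8b8 source with an a8 mask, a8r8g8b8 destination.
 * Four-pixel groups that are fully opaque in both source and mask are
 * copied straight through; otherwise the source alpha is expanded and
 * the general in_over is applied.
 */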
4963 static void
4964 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4965                                  pixman_op_t              op,
4966                                  pixman_image_t *         src_image,
4967                                  pixman_image_t *         mask_image,
4968                                  pixman_image_t *         dst_image,
4969                                  int32_t                  src_x,
4970                                  int32_t                  src_y,
4971                                  int32_t                  mask_x,
4972                                  int32_t                  mask_y,
4973                                  int32_t                  dest_x,
4974                                  int32_t                  dest_y,
4975                                  int32_t                  width,
4976                                  int32_t                  height)
4977 {
4978     uint32_t    *src, *src_line, s;
4979     uint32_t    *dst, *dst_line, d;
4980     uint8_t         *mask, *mask_line;
4981     uint32_t m;
4982     int src_stride, mask_stride, dst_stride;
4983     int32_t w;
4984
4985     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4986     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4987     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4988
4989     PIXMAN_IMAGE_GET_LINE (
4990         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4991     PIXMAN_IMAGE_GET_LINE (
4992         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4993     PIXMAN_IMAGE_GET_LINE (
4994         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4995
4996     while (height--)
4997     {
4998         src = src_line;
4999         src_line += src_stride;
5000         dst = dst_line;
5001         dst_line += dst_stride;
5002         mask = mask_line;
5003         mask_line += mask_stride;
5004
5005         w = width;
5006
5007         while (w && (unsigned long)dst & 15)
5008         {
5009             uint32_t sa;
5010
5011             s = *src++;
5012             m = (uint32_t) *mask++;
5013             d = *dst;
5014
5015             sa = s >> 24;
5016
5017             if (m)
5018             {
5019                 if (sa == 0xff && m == 0xff)
5020                 {
5021                     *dst = s;
5022                 }
5023                 else
5024                 {
5025                     __m128i ms, md, ma, msa;
5026
5027                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5028                     ms = unpack_32_1x128 (s);
5029                     md = unpack_32_1x128 (d);
5030
5031                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5032
5033                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5034                 }
5035             }
5036
5037             dst++;
5038             w--;
5039         }
5040
5041         while (w >= 4)
5042         {
5043             m = *(uint32_t *) mask;
5044
5045             if (m)
5046             {
5047                 xmm_src = load_128_unaligned ((__m128i*)src);
5048
5049                 if (m == 0xffffffff && is_opaque (xmm_src))
5050                 {
5051                     save_128_aligned ((__m128i *)dst, xmm_src);
5052                 }
5053                 else
5054                 {
5055                     xmm_dst = load_128_aligned ((__m128i *)dst);
5056
5057                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5058
5059                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5060                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5061                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5062
5063                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5064                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5065
5066                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5067                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5068
5069                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5070                 }
5071             }
5072
5073             src += 4;
5074             dst += 4;
5075             mask += 4;
5076             w -= 4;
5077         }
5078
5079         while (w)
5080         {
5081             uint32_t sa;
5082
5083             s = *src++;
5084             m = (uint32_t) *mask++;
5085             d = *dst;
5086
5087             sa = s >> 24;
5088
5089             if (m)
5090             {
5091                 if (sa == 0xff && m == 0xff)
5092                 {
5093                     *dst = s;
5094                 }
5095                 else
5096                 {
5097                     __m128i ms, md, ma, msa;
5098
5099                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5100                     ms = unpack_32_1x128 (s);
5101                     md = unpack_32_1x128 (d);
5102
5103                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5104
5105                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5106                 }
5107             }
5108
5109             dst++;
5110             w--;
5111         }
5112     }
5113
5114 }
5115
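/* OVER_REVERSE with a solid source: the destination is composited over
 * the solid color, so only dst is read and written and the source can
 * be expanded into an SSE2 register once per call, outside the loops.
 */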
5116 static void
5117 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5118                                     pixman_op_t              op,
5119                                     pixman_image_t *         src_image,
5120                                     pixman_image_t *         mask_image,
5121                                     pixman_image_t *         dst_image,
5122                                     int32_t                  src_x,
5123                                     int32_t                  src_y,
5124                                     int32_t                  mask_x,
5125                                     int32_t                  mask_y,
5126                                     int32_t                  dest_x,
5127                                     int32_t                  dest_y,
5128                                     int32_t                  width,
5129                                     int32_t                  height)
5130 {
5131     uint32_t src;
5132     uint32_t    *dst_line, *dst;
5133     __m128i xmm_src;
5134     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5135     __m128i xmm_dsta_hi, xmm_dsta_lo;
5136     int dst_stride;
5137     int32_t w;
5138
5139     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5140
5141     if (src == 0)
5142         return;
5143
5144     PIXMAN_IMAGE_GET_LINE (
5145         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5146
5147     xmm_src = expand_pixel_32_1x128 (src);
5148
5149     while (height--)
5150     {
5151         dst = dst_line;
5152
5153         dst_line += dst_stride;
5154         w = width;
5155
5156         while (w && (unsigned long)dst & 15)
5157         {
5158             __m128i vd;
5159
5160             vd = unpack_32_1x128 (*dst);
5161
5162             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5163                                               xmm_src));
5164             w--;
5165             dst++;
5166         }
5167
5168         while (w >= 4)
5169         {
5170             __m128i tmp_lo, tmp_hi;
5171
5172             xmm_dst = load_128_aligned ((__m128i*)dst);
5173
5174             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5175             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5176
5177             tmp_lo = xmm_src;
5178             tmp_hi = xmm_src;
5179
5180             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5181                         &xmm_dsta_lo, &xmm_dsta_hi,
5182                         &tmp_lo, &tmp_hi);
5183
5184             save_128_aligned (
5185                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5186
5187             w -= 4;
5188             dst += 4;
5189         }
5190
5191         while (w)
5192         {
5193             __m128i vd;
5194
5195             vd = unpack_32_1x128 (*dst);
5196
5197             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5198                                               xmm_src));
5199             w--;
5200             dst++;
5201         }
5202
5203     }
5204
5205 }
5206
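/* OVER with an 8888 source and an 8888 mask.  Only the alpha channel of
 * the mask is used (m = *mask >> 24), so this matches the 8888_8_8888
 * case except that the mask is read with a 32-bit stride.
 */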
5207 static void
5208 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5209                                     pixman_op_t              op,
5210                                     pixman_image_t *         src_image,
5211                                     pixman_image_t *         mask_image,
5212                                     pixman_image_t *         dst_image,
5213                                     int32_t                  src_x,
5214                                     int32_t                  src_y,
5215                                     int32_t                  mask_x,
5216                                     int32_t                  mask_y,
5217                                     int32_t                  dest_x,
5218                                     int32_t                  dest_y,
5219                                     int32_t                  width,
5220                                     int32_t                  height)
5221 {
5222     uint32_t    *src, *src_line, s;
5223     uint32_t    *dst, *dst_line, d;
5224     uint32_t    *mask, *mask_line;
5225     uint32_t    m;
5226     int src_stride, mask_stride, dst_stride;
5227     int32_t w;
5228
5229     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5230     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5231     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5232
5233     PIXMAN_IMAGE_GET_LINE (
5234         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5235     PIXMAN_IMAGE_GET_LINE (
5236         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5237     PIXMAN_IMAGE_GET_LINE (
5238         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5239
5240     while (height--)
5241     {
5242         src = src_line;
5243         src_line += src_stride;
5244         dst = dst_line;
5245         dst_line += dst_stride;
5246         mask = mask_line;
5247         mask_line += mask_stride;
5248
5249         w = width;
5250
5251         while (w && (unsigned long)dst & 15)
5252         {
5253             uint32_t sa;
5254
5255             s = *src++;
5256             m = (*mask++) >> 24;
5257             d = *dst;
5258
5259             sa = s >> 24;
5260
5261             if (m)
5262             {
5263                 if (sa == 0xff && m == 0xff)
5264                 {
5265                     *dst = s;
5266                 }
5267                 else
5268                 {
5269                     __m128i ms, md, ma, msa;
5270
5271                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5272                     ms = unpack_32_1x128 (s);
5273                     md = unpack_32_1x128 (d);
5274
5275                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5276
5277                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5278                 }
5279             }
5280
5281             dst++;
5282             w--;
5283         }
5284
5285         while (w >= 4)
5286         {
5287             xmm_mask = load_128_unaligned ((__m128i*)mask);
5288
5289             if (!is_transparent (xmm_mask))
5290             {
5291                 xmm_src = load_128_unaligned ((__m128i*)src);
5292
5293                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5294                 {
5295                     save_128_aligned ((__m128i *)dst, xmm_src);
5296                 }
5297                 else
5298                 {
5299                     xmm_dst = load_128_aligned ((__m128i *)dst);
5300
5301                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5302                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5303                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5304
5305                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5306                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5307
5308                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5309                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5310
5311                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5312                 }
5313             }
5314
5315             src += 4;
5316             dst += 4;
5317             mask += 4;
5318             w -= 4;
5319         }
5320
5321         while (w)
5322         {
5323             uint32_t sa;
5324
5325             s = *src++;
5326             m = (*mask++) >> 24;
5327             d = *dst;
5328
5329             sa = s >> 24;
5330
5331             if (m)
5332             {
5333                 if (sa == 0xff && m == 0xff)
5334                 {
5335                     *dst = s;
5336                 }
5337                 else
5338                 {
5339                     __m128i ms, md, ma, msa;
5340
5341                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5342                     ms = unpack_32_1x128 (s);
5343                     md = unpack_32_1x128 (d);
5344
5345                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5346
5347                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5348                 }
5349             }
5350
5351             dst++;
5352             w--;
5353         }
5354     }
5355
5356 }
5357
5358 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5359 static force_inline void
5360 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5361                                              const uint32_t* ps,
5362                                              int32_t         w,
5363                                              pixman_fixed_t  vx,
5364                                              pixman_fixed_t  unit_x,
5365                                              pixman_fixed_t  max_vx,
5366                                              pixman_bool_t   fully_transparent_src)
5367 {
5368     uint32_t s, d;
5369     const uint32_t* pm = NULL;
5370
5371     __m128i xmm_dst_lo, xmm_dst_hi;
5372     __m128i xmm_src_lo, xmm_src_hi;
5373     __m128i xmm_alpha_lo, xmm_alpha_hi;
5374
5375     if (fully_transparent_src)
5376         return;
5377
5378     /* Align dst on a 16-byte boundary */
5379     while (w && ((unsigned long)pd & 15))
5380     {
5381         d = *pd;
5382         s = combine1 (ps + (vx >> 16), pm);
5383         vx += unit_x;
5384
5385         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5386         if (pm)
5387             pm++;
5388         w--;
5389     }
5390
5391     while (w >= 4)
5392     {
5393         __m128i tmp;
5394         uint32_t tmp1, tmp2, tmp3, tmp4;
5395
5396         tmp1 = ps[vx >> 16];
5397         vx += unit_x;
5398         tmp2 = ps[vx >> 16];
5399         vx += unit_x;
5400         tmp3 = ps[vx >> 16];
5401         vx += unit_x;
5402         tmp4 = ps[vx >> 16];
5403         vx += unit_x;
5404
5405         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5406
5407         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5408
5409         if (is_opaque (xmm_src_hi))
5410         {
5411             save_128_aligned ((__m128i*)pd, xmm_src_hi);
5412         }
5413         else if (!is_zero (xmm_src_hi))
5414         {
5415             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5416
5417             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5418             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5419
5420             expand_alpha_2x128 (
5421                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5422
5423             over_2x128 (&xmm_src_lo, &xmm_src_hi,
5424                         &xmm_alpha_lo, &xmm_alpha_hi,
5425                         &xmm_dst_lo, &xmm_dst_hi);
5426
5427             /* rebuild the 4 pixel data and save */
5428             save_128_aligned ((__m128i*)pd,
5429                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5430         }
5431
5432         w -= 4;
5433         pd += 4;
5434         if (pm)
5435             pm += 4;
5436     }
5437
5438     while (w)
5439     {
5440         d = *pd;
5441         s = combine1 (ps + (vx >> 16), pm);
5442         vx += unit_x;
5443
5444         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5445         if (pm)
5446             pm++;
5447
5448         w--;
5449     }
5450 }
5451
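/* Instantiate the nearest-scaling main loops around the scanline
 * function above, one for each supported repeat mode.
 */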
5452 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5453                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5454                        uint32_t, uint32_t, COVER)
5455 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5456                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5457                        uint32_t, uint32_t, NONE)
5458 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5459                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5460                        uint32_t, uint32_t, PAD)
5461
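/* Nearest-scaled OVER with a solid mask: the mask alpha is constant for
 * the whole scanline, so it is expanded once into xmm_mask and reused
 * for every pixel.
 */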
5462 static force_inline void
5463 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5464                                                uint32_t *       dst,
5465                                                const uint32_t * src,
5466                                                int32_t          w,
5467                                                pixman_fixed_t   vx,
5468                                                pixman_fixed_t   unit_x,
5469                                                pixman_fixed_t   max_vx,
5470                                                pixman_bool_t    zero_src)
5471 {
5472     __m128i xmm_mask;
5473     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5474     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5475     __m128i xmm_alpha_lo, xmm_alpha_hi;
5476
5477     if (zero_src || (*mask >> 24) == 0)
5478         return;
5479
5480     xmm_mask = create_mask_16_128 (*mask >> 24);
5481
5482     while (w && (unsigned long)dst & 15)
5483     {
5484         uint32_t s = src[pixman_fixed_to_int (vx)];
5485         vx += unit_x;
5486
5487         if (s)
5488         {
5489             uint32_t d = *dst;
5490
5491             __m128i ms = unpack_32_1x128 (s);
5492             __m128i alpha     = expand_alpha_1x128 (ms);
5493             __m128i dest      = xmm_mask;
5494             __m128i alpha_dst = unpack_32_1x128 (d);
5495
5496             *dst = pack_1x128_32 (
5497                 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5498         }
5499         dst++;
5500         w--;
5501     }
5502
5503     while (w >= 4)
5504     {
5505         uint32_t tmp1, tmp2, tmp3, tmp4;
5506
5507         tmp1 = src[pixman_fixed_to_int (vx)];
5508         vx += unit_x;
5509         tmp2 = src[pixman_fixed_to_int (vx)];
5510         vx += unit_x;
5511         tmp3 = src[pixman_fixed_to_int (vx)];
5512         vx += unit_x;
5513         tmp4 = src[pixman_fixed_to_int (vx)];
5514         vx += unit_x;
5515
5516         xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5517
5518         if (!is_zero (xmm_src))
5519         {
5520             xmm_dst = load_128_aligned ((__m128i*)dst);
5521
5522             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5523             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5524             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5525                                 &xmm_alpha_lo, &xmm_alpha_hi);
5526
5527             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5528                            &xmm_alpha_lo, &xmm_alpha_hi,
5529                            &xmm_mask, &xmm_mask,
5530                            &xmm_dst_lo, &xmm_dst_hi);
5531
5532             save_128_aligned (
5533                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5534         }
5535
5536         dst += 4;
5537         w -= 4;
5538     }
5539
5540     while (w)
5541     {
5542         uint32_t s = src[pixman_fixed_to_int (vx)];
5543         vx += unit_x;
5544
5545         if (s)
5546         {
5547             uint32_t d = *dst;
5548
5549             __m128i ms = unpack_32_1x128 (s);
5550             __m128i alpha = expand_alpha_1x128 (ms);
5551             __m128i mask  = xmm_mask;
5552             __m128i dest  = unpack_32_1x128 (d);
5553
5554             *dst = pack_1x128_32 (
5555                 in_over_1x128 (&ms, &alpha, &mask, &dest));
5556         }
5557
5558         dst++;
5559         w--;
5560     }
5561
5562 }
5563
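/* Main loop instantiations for the solid-mask nearest-scaling OVER,
 * again one per repeat mode.
 */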
5564 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5565                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5566                               uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5567 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5568                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5569                               uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5570 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5571                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5572                               uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5573
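/* The table below maps (op, src format, mask format, dst format) tuples
 * to the specialized routines above; it is scanned for the first match.
 * For example, compositing with PIXMAN_OP_OVER from an a8r8g8b8 source
 * with no mask into an x8r8g8b8 destination resolves to
 * sse2_composite_over_8888_8888.
 */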
5574 static const pixman_fast_path_t sse2_fast_paths[] =
5575 {
5576     /* PIXMAN_OP_OVER */
5577     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5578     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5579     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5580     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5581     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5582     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5583     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5584     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5585     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5586     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5587     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5588     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5589     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5590     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5591     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5592     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5593     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5594     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5595     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5596     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5597     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5598     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5599     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5600     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5601     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5602     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5603     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5604     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5605     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5606     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5607     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5608     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5609     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5610     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5611     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5612     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5613     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5614     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5615     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5616     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5617     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5618     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5619     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5620     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5621     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5622     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5623
5624     /* PIXMAN_OP_OVER_REVERSE */
5625     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5626     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5627
5628     /* PIXMAN_OP_ADD */
5629     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5630     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5631     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5632     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5633     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5634     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5635
5636     /* PIXMAN_OP_SRC */
5637     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5638     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5639     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5640     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5641     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5642     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5643     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5644     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5645     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5646     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5647     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5648     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5649     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5650     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5651
5652     /* PIXMAN_OP_IN */
5653     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5654     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5655     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5656
5657     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5658     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5659     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5660     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5661     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5662     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5663     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5664     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5665     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5666     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5667     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5668     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5669
5670     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
5671     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
5672     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
5673     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
5674
5675     { PIXMAN_OP_NONE },
5676 };
5677
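/* Try the SSE2 blt; for bpp combinations it does not handle, fall back
 * to the delegate implementation.
 */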
5678 static pixman_bool_t
5679 sse2_blt (pixman_implementation_t *imp,
5680           uint32_t *               src_bits,
5681           uint32_t *               dst_bits,
5682           int                      src_stride,
5683           int                      dst_stride,
5684           int                      src_bpp,
5685           int                      dst_bpp,
5686           int                      src_x,
5687           int                      src_y,
5688           int                      dst_x,
5689           int                      dst_y,
5690           int                      width,
5691           int                      height)
5692 {
5693     if (!pixman_blt_sse2 (
5694             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5695             src_x, src_y, dst_x, dst_y, width, height))
5696
5697     {
5698         return _pixman_implementation_blt (
5699             imp->delegate,
5700             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5701             src_x, src_y, dst_x, dst_y, width, height);
5702     }
5703
5704     return TRUE;
5705 }
5706
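/* Solid fill with the same delegate fallback as sse2_blt.  On 32-bit
 * GCC the stack is forcibly realigned on entry because the SSE2 code
 * needs 16-byte alignment that the i386 ABI does not guarantee.
 */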
5707 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5708 __attribute__((__force_align_arg_pointer__))
5709 #endif
5710 static pixman_bool_t
5711 sse2_fill (pixman_implementation_t *imp,
5712            uint32_t *               bits,
5713            int                      stride,
5714            int                      bpp,
5715            int                      x,
5716            int                      y,
5717            int                      width,
5718            int                      height,
5719            uint32_t xor)
5720 {
5721     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5722     {
5723         return _pixman_implementation_fill (
5724             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5725     }
5726
5727     return TRUE;
5728 }
5729
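/* Scanline fetcher for x8r8g8b8: the pixels already have the right
 * layout, so fetching reduces to OR-ing in an opaque alpha byte.
 */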
5730 static uint32_t *
5731 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
5732 {
5733     int w = iter->width;
5734     __m128i ff000000 = mask_ff000000;
5735     uint32_t *dst = iter->buffer;
5736     uint32_t *src = (uint32_t *)iter->bits;
5737
5738     iter->bits += iter->stride;
5739
5740     while (w && ((unsigned long)dst) & 0x0f)
5741     {
5742         *dst++ = (*src++) | 0xff000000;
5743         w--;
5744     }
5745
5746     while (w >= 4)
5747     {
5748         save_128_aligned (
5749             (__m128i *)dst, _mm_or_si128 (
5750                 load_128_unaligned ((__m128i *)src), ff000000));
5751
5752         dst += 4;
5753         src += 4;
5754         w -= 4;
5755     }
5756
5757     while (w)
5758     {
5759         *dst++ = (*src++) | 0xff000000;
5760         w--;
5761     }
5762
5763     return iter->buffer;
5764 }
5765
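/* Scanline fetcher for r5g6b5: widens eight 565 pixels per iteration
 * to 8888 with unpack_565_to_8888 () and forces the alpha byte on.
 */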
5766 static uint32_t *
5767 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
5768 {
5769     int w = iter->width;
5770     uint32_t *dst = iter->buffer;
5771     uint16_t *src = (uint16_t *)iter->bits;
5772     __m128i ff000000 = mask_ff000000;
5773
5774     iter->bits += iter->stride;
5775
5776     while (w && ((unsigned long)dst) & 0x0f)
5777     {
5778         uint16_t s = *src++;
5779
5780         *dst++ = CONVERT_0565_TO_8888 (s);
5781         w--;
5782     }
5783
5784     while (w >= 8)
5785     {
5786         __m128i lo, hi, s;
5787
5788         s = _mm_loadu_si128 ((__m128i *)src);
5789
5790         lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
5791         hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
5792
5793         save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
5794         save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
5795
5796         dst += 8;
5797         src += 8;
5798         w -= 8;
5799     }
5800
5801     while (w)
5802     {
5803         uint16_t s = *src++;
5804
5805         *dst++ = CONVERT_0565_TO_8888 (s);
5806         w--;
5807     }
5808
5809     return iter->buffer;
5810 }
5811
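/* Scanline fetcher for a8: each alpha byte is shifted into the top
 * byte of a 32-bit pixel, sixteen pixels per SSE2 iteration.
 */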
5812 static uint32_t *
5813 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
5814 {
5815     int w = iter->width;
5816     uint32_t *dst = iter->buffer;
5817     uint8_t *src = iter->bits;
5818     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5819
5820     iter->bits += iter->stride;
5821
5822     while (w && (((unsigned long)dst) & 15))
5823     {
5824         *dst++ = *(src++) << 24;
5825         w--;
5826     }
5827
5828     while (w >= 16)
5829     {
5830         xmm0 = _mm_loadu_si128((__m128i *)src);
5831
5832         xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
5833         xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
5834         xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
5835         xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
5836         xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
5837         xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
5838
5839         _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
5840         _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
5841         _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
5842         _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
5843
5844         dst += 16;
5845         src += 16;
5846         w -= 16;
5847     }
5848
5849     while (w)
5850     {
5851         *dst++ = *(src++) << 24;
5852         w--;
5853     }
5854
5855     return iter->buffer;
5856 }
5857
5858 typedef struct
5859 {
5860     pixman_format_code_t        format;
5861     pixman_iter_get_scanline_t  get_scanline;
5862 } fetcher_info_t;
5863
5864 static const fetcher_info_t fetchers[] =
5865 {
5866     { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
5867     { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
5868     { PIXMAN_a8,                sse2_fetch_a8 },
5869     { PIXMAN_null }
5870 };
5871
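/* Use the SSE2 fetchers only for narrow, untransformed images whose
 * requested region lies entirely inside the image bits; everything
 * else goes to the delegate.
 */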
5872 static void
5873 sse2_src_iter_init (pixman_implementation_t *imp,
5874                     pixman_iter_t *iter,
5875                     pixman_image_t *image,
5876                     int x, int y, int width, int height,
5877                     uint8_t *buffer, iter_flags_t flags)
5878 {
5879 #define FLAGS                                                           \
5880     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
5881
5882     if ((flags & ITER_NARROW)                           &&
5883         (image->common.flags & FLAGS) == FLAGS          &&
5884         x >= 0 && y >= 0                                &&
5885         x + width <= image->bits.width                  &&
5886         y + height <= image->bits.height)
5887     {
5888         const fetcher_info_t *f;
5889
5890         for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
5891         {
5892             if (image->common.extended_format_code == f->format)
5893             {
5894                 uint8_t *b = (uint8_t *)image->bits.bits;
5895                 int s = image->bits.rowstride * 4;
5896
5897                 iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
5898                 iter->stride = s;
5899                 iter->width = width;
5900                 iter->buffer = (uint32_t *)buffer;
5901
5902                 iter->get_scanline = f->get_scanline;
5903                 return;
5904             }
5905         }
5906     }
5907
5908     _pixman_implementation_src_iter_init (
5909         imp->delegate, iter, image, x, y, width, height, buffer, flags);
5910 }
5911
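/* Create the SSE2 implementation: register the fast paths, set up the
 * shared SSE2 constants and hook up the combiners, blt, fill and source
 * iterator, delegating everything else to 'fallback'.
 */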
5912 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5913 __attribute__((__force_align_arg_pointer__))
5914 #endif
5915 pixman_implementation_t *
5916 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
5917 {
5918     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
5919
5920     /* SSE2 constants */
5921     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5922     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5923     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5924     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5925     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5926     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5927     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5928     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5929     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
5930     mask_0080 = create_mask_16_128 (0x0080);
5931     mask_00ff = create_mask_16_128 (0x00ff);
5932     mask_0101 = create_mask_16_128 (0x0101);
5933     mask_ffff = create_mask_16_128 (0xffff);
5934     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5935     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5936
5937     /* Set up function pointers */
5938     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5939     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5940     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5941     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5942     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5943     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5944     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5945     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5946     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5947     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5948
5949     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5950
5951     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5952     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5953     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5954     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5955     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5956     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5957     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5958     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5959     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5960     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5961     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5962
5963     imp->blt = sse2_blt;
5964     imp->fill = sse2_fill;
5965
5966     imp->src_iter_init = sse2_src_iter_init;
5967
5968     return imp;
5969 }