sse2: Remove all the core_combine_* functions
pixman/pixman-sse2.c
/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
#   include "pixman-x64-mmx-emulation.h"
#endif

#ifdef USE_SSE2

/* --------------------------------------------------------------------
 * Locals
 */

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;
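
/* These constants are filled in when the SSE2 implementation is
 * created (see _pixman_implementation_create_sse2 ()) and are
 * read-only afterwards.
 */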

/* ----------------------------------------------------------------------
 * SSE2 Inlines
 */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
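
/* A scalar sketch of the 565 -> 8888 expansion above (illustrative
 * only, not part of pixman): each channel is widened by shifting left
 * and replicating its top bits into the vacated low bits, e.g. for
 * the 5-bit red channel
 *
 *     r8 = (r5 << 3) | (r5 >> 2);
 *
 * so 0x1f maps to 0xff and 0 maps to 0.  mask_565_fix_rb and
 * mask_565_fix_g select the bits to replicate for the 5-bit and
 * 6-bit channels respectively.
 */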

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
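
/* _mm_movemask_epi8 () packs the top bit of each of the 16 bytes into
 * a 16-bit mask.  Alpha is byte 3 of each 32-bit ARGB pixel, so bits
 * 3, 7, 11 and 15 -- the 0x8888 pattern above -- are the ones that
 * matter when only the alpha channels are being tested.
 */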

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
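
/* pix_multiply_* computes the correctly rounded x * a / 255 for each
 * 8-bit channel (held in a 16-bit lane).  A scalar sketch of the
 * trick, illustrative only and not part of pixman:
 *
 *     uint16_t t = (uint16_t)x * a + 0x80;
 *     uint8_t  r = (t + (t >> 8)) >> 8;
 *
 * which equals (t * 0x0101) >> 16, i.e. exactly what
 * _mm_mulhi_epu16 (t, mask_0101) returns.  For example x = a = 0xff
 * gives t = 0xfe81 and r = 0xff.
 */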

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}
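
/* pix_add_multiply_* evaluates
 *
 *     ret = src * alpha_dst + dst * alpha_src
 *
 * with unsigned saturation.  It is the workhorse of the ATOP and XOR
 * combiners below, which differ only in which of the two alphas are
 * negated before the call.
 */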

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
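
/* over_2x128 () implements the premultiplied OVER operator,
 *
 *     dst = src + (1 - alpha_src) * dst
 *
 * where the multiplication is the rounded division by 255 from
 * pix_multiply_2x128 () and the final addition saturates.
 */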

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
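
/* in_over_2x128 () is the core of the masked operations:
 *
 *     dst = (src IN mask) OVER dst
 *         = src * mask + dst * (1 - alpha_src * mask)
 *
 * i.e. both the source and its alpha are first multiplied by the
 * per-channel mask.
 */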

/* load 4 pixels from a 16-byte-aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels to a 16-byte-aligned address using a non-temporal
 * (write-combining) store
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels to a 16-byte-aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels to an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

/* ------------------------------------------------------------------
 * MMX inlines
 */

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

/* ----------------------------------------------------------------------------
 * Compose Core transformations
 */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}
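
/* Fast paths: a fully opaque source replaces the destination outright
 * and a zero source leaves it untouched, so the unpack/over/pack
 * round trip only runs for genuinely translucent pixels.
 */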

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
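
/* combine1 () and combine4 () apply the optional mask to one and four
 * source pixels respectively: the mask's alpha is broadcast to all
 * four channels and multiplied into the source, which is what the
 * unified-alpha (_u) combiners expect.  combine4 () also
 * short-circuits a fully transparent mask to zero.
 *
 * The combiners below all share one shape: a scalar head loop runs
 * until the destination is 16-byte aligned, a vector loop then
 * handles four pixels per iteration with aligned stores, and a
 * scalar tail loop finishes the remainder.
 */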

static force_inline void
core_combine_over_u_sse2_mask (uint32_t *         pd,
                               const uint32_t*    ps,
                               const uint32_t*    pm,
                               int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t*    ps,
                                  int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}
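
/* Keeping the mask and no-mask cases in separate force_inline helpers
 * lets the compiler drop the per-pixel "if (pm)" tests from the hot
 * loops; sse2_combine_over_u () merely dispatches between them.
 */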

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}
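
/* OVER_REVERSE is the same operator with the operands swapped,
 * dst = dst + (1 - alpha_dst) * src, hence the reuse of
 * core_combine_over_u_pixel_sse2 () with d and s exchanged.
 */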

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}
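
/* IN multiplies one operand by the other operand's alpha: the helper
 * above returns dst * alpha_src, so the IN and IN_REVERSE combiners
 * below simply pass their arguments in the order the operator needs.
 */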

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}
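
/* OUT_REVERSE keeps the part of the destination not covered by the
 * source, dst = dst * (1 - alpha_src); OUT below is the mirror image,
 * src * (1 - alpha_dst).
 */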

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
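
/* ATOP: dst = src * alpha_dst + dst * (1 - alpha_src); the source is
 * kept only where the destination is, and the destination shows
 * through where the source is transparent.  ATOP_REVERSE below swaps
 * which of the two alphas is negated.
 */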

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}
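
/* XOR: dst = src * (1 - alpha_dst) + dst * (1 - alpha_src); each
 * operand survives only where the other one is transparent.
 */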

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}
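
/* ADD is a plain saturating byte-wise addition, so the vector loop
 * needs no unpacking at all: _mm_adds_epu8 () does the whole job.
 */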

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}
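
/* SATURATE scales the source down just enough that adding it cannot
 * overflow the destination: when alpha_src > 1 - alpha_dst, the
 * source is first multiplied by (1 - alpha_dst) / alpha_src.
 */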

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned  ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if any src alpha is greater than the corresponding ~dst alpha */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}
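
/* The _ca ("component alpha") combiners below take a full ARGB mask
 * per pixel rather than a single alpha value, so each channel of the
 * source is multiplied by the matching channel of the mask.
 */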

static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }
}

static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i expAlpha = expand_alpha_1x128 (s);
    __m128i unpk_mask = unpack_32_1x128 (mask);
    __m128i unpk_dst  = unpack_32_1x128 (dst);

    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i d = unpack_32_1x128 (dst);

    return pack_1x128_32 (
        over_1x128 (d, expand_alpha_1x128 (d),
                    pix_multiply_1x128 (unpack_32_1x128 (src),
                                        unpack_32_1x128 (mask))));
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }
}
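
/* IN_ca computes dst = (src * mask) * alpha_dst; IN_REVERSE_ca below
 * flips it to dst = dst * (mask * alpha_src).
 */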
1789
1790 static void
1791 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1792                             pixman_op_t              op,
1793                             uint32_t *               pd,
1794                             const uint32_t *         ps,
1795                             const uint32_t *         pm,
1796                             int                      w)
1797 {
1798     uint32_t s, m, d;
1799
1800     __m128i xmm_alpha_lo, xmm_alpha_hi;
1801     __m128i xmm_src_lo, xmm_src_hi;
1802     __m128i xmm_dst_lo, xmm_dst_hi;
1803     __m128i xmm_mask_lo, xmm_mask_hi;
1804
1805     while (w && (unsigned long)pd & 15)
1806     {
1807         s = *ps++;
1808         m = *pm++;
1809         d = *pd;
1810
1811         *pd++ = pack_1x128_32 (
1812             pix_multiply_1x128 (
1813                 unpack_32_1x128 (d),
1814                 pix_multiply_1x128 (unpack_32_1x128 (m),
1815                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1816         w--;
1817     }
1818
1819     while (w >= 4)
1820     {
1821         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1822         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1823         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1824
1825         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1826         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1827         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1828
1829         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1830                             &xmm_alpha_lo, &xmm_alpha_hi);
1831         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1832                             &xmm_alpha_lo, &xmm_alpha_hi,
1833                             &xmm_alpha_lo, &xmm_alpha_hi);
1834
1835         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1836                             &xmm_alpha_lo, &xmm_alpha_hi,
1837                             &xmm_dst_lo, &xmm_dst_hi);
1838
1839         save_128_aligned (
1840             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1841
1842         ps += 4;
1843         pd += 4;
1844         pm += 4;
1845         w -= 4;
1846     }
1847
1848     while (w)
1849     {
1850         s = *ps++;
1851         m = *pm++;
1852         d = *pd;
1853
1854         *pd++ = pack_1x128_32 (
1855             pix_multiply_1x128 (
1856                 unpack_32_1x128 (d),
1857                 pix_multiply_1x128 (unpack_32_1x128 (m),
1858                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1859         w--;
1860     }
1861 }
1862
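/* OUT is IN with the destination alpha complemented, per channel:
 *
 *     dst.c = src.c * mask.c * (1 - dst.a)
 */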
1863 static void
1864 sse2_combine_out_ca (pixman_implementation_t *imp,
1865                      pixman_op_t              op,
1866                      uint32_t *               pd,
1867                      const uint32_t *         ps,
1868                      const uint32_t *         pm,
1869                      int                      w)
1870 {
1871     uint32_t s, m, d;
1872
1873     __m128i xmm_alpha_lo, xmm_alpha_hi;
1874     __m128i xmm_src_lo, xmm_src_hi;
1875     __m128i xmm_dst_lo, xmm_dst_hi;
1876     __m128i xmm_mask_lo, xmm_mask_hi;
1877
1878     while (w && (unsigned long)pd & 15)
1879     {
1880         s = *ps++;
1881         m = *pm++;
1882         d = *pd;
1883
1884         *pd++ = pack_1x128_32 (
1885             pix_multiply_1x128 (
1886                 pix_multiply_1x128 (
1887                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1888                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1889         w--;
1890     }
1891
1892     while (w >= 4)
1893     {
1894         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1895         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1896         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1897
1898         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1899         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1900         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1901
1902         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1903                             &xmm_alpha_lo, &xmm_alpha_hi);
1904         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1905                       &xmm_alpha_lo, &xmm_alpha_hi);
1906
1907         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1908                             &xmm_mask_lo, &xmm_mask_hi,
1909                             &xmm_dst_lo, &xmm_dst_hi);
1910         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1911                             &xmm_alpha_lo, &xmm_alpha_hi,
1912                             &xmm_dst_lo, &xmm_dst_hi);
1913
1914         save_128_aligned (
1915             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1916
1917         ps += 4;
1918         pd += 4;
1919         pm += 4;
1920         w -= 4;
1921     }
1922
1923     while (w)
1924     {
1925         s = *ps++;
1926         m = *pm++;
1927         d = *pd;
1928
1929         *pd++ = pack_1x128_32 (
1930             pix_multiply_1x128 (
1931                 pix_multiply_1x128 (
1932                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1933                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1934
1935         w--;
1936     }
1937 }
1938
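/* The reverse OUT combiner keeps the destination where the mask-weighted
 * source alpha is transparent, per channel:
 *
 *     dst.c = dst.c * (1 - mask.c * src.a)
 */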
1939 static void
1940 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1941                              pixman_op_t              op,
1942                              uint32_t *               pd,
1943                              const uint32_t *         ps,
1944                              const uint32_t *         pm,
1945                              int                      w)
1946 {
1947     uint32_t s, m, d;
1948
1949     __m128i xmm_alpha_lo, xmm_alpha_hi;
1950     __m128i xmm_src_lo, xmm_src_hi;
1951     __m128i xmm_dst_lo, xmm_dst_hi;
1952     __m128i xmm_mask_lo, xmm_mask_hi;
1953
1954     while (w && (unsigned long)pd & 15)
1955     {
1956         s = *ps++;
1957         m = *pm++;
1958         d = *pd;
1959
1960         *pd++ = pack_1x128_32 (
1961             pix_multiply_1x128 (
1962                 unpack_32_1x128 (d),
1963                 negate_1x128 (pix_multiply_1x128 (
1964                                  unpack_32_1x128 (m),
1965                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1966         w--;
1967     }
1968
1969     while (w >= 4)
1970     {
1971         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1972         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1973         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1974
1975         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1976         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1977         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1978
1979         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1980                             &xmm_alpha_lo, &xmm_alpha_hi);
1981
1982         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1983                             &xmm_alpha_lo, &xmm_alpha_hi,
1984                             &xmm_mask_lo, &xmm_mask_hi);
1985
1986         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1987                       &xmm_mask_lo, &xmm_mask_hi);
1988
1989         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1990                             &xmm_mask_lo, &xmm_mask_hi,
1991                             &xmm_dst_lo, &xmm_dst_hi);
1992
1993         save_128_aligned (
1994             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1995
1996         ps += 4;
1997         pd += 4;
1998         pm += 4;
1999         w -= 4;
2000     }
2001
2002     while (w)
2003     {
2004         s = *ps++;
2005         m = *pm++;
2006         d = *pd;
2007
2008         *pd++ = pack_1x128_32 (
2009             pix_multiply_1x128 (
2010                 unpack_32_1x128 (d),
2011                 negate_1x128 (pix_multiply_1x128 (
2012                                  unpack_32_1x128 (m),
2013                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
2014         w--;
2015     }
2016 }
2017
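/* ATOP blends the masked source onto the destination while preserving
 * the destination alpha, per channel:
 *
 *     dst.c = src.c * mask.c * dst.a + dst.c * (1 - mask.c * src.a)
 *
 * pix_add_multiply_1x128 below evaluates both products and the
 * saturated sum in one pass.
 */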
2018 static force_inline uint32_t
2019 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2020                                  uint32_t mask,
2021                                  uint32_t dst)
2022 {
2023     __m128i m = unpack_32_1x128 (mask);
2024     __m128i s = unpack_32_1x128 (src);
2025     __m128i d = unpack_32_1x128 (dst);
2026     __m128i sa = expand_alpha_1x128 (s);
2027     __m128i da = expand_alpha_1x128 (d);
2028
2029     s = pix_multiply_1x128 (s, m);
2030     m = negate_1x128 (pix_multiply_1x128 (m, sa));
2031
2032     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2033 }
2034
2035 static void
2036 sse2_combine_atop_ca (pixman_implementation_t *imp,
2037                       pixman_op_t              op,
2038                       uint32_t *               pd,
2039                       const uint32_t *         ps,
2040                       const uint32_t *         pm,
2041                       int                      w)
2042 {
2043     uint32_t s, m, d;
2044
2045     __m128i xmm_src_lo, xmm_src_hi;
2046     __m128i xmm_dst_lo, xmm_dst_hi;
2047     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2048     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2049     __m128i xmm_mask_lo, xmm_mask_hi;
2050
2051     while (w && (unsigned long)pd & 15)
2052     {
2053         s = *ps++;
2054         m = *pm++;
2055         d = *pd;
2056
2057         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2058         w--;
2059     }
2060
2061     while (w >= 4)
2062     {
2063         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2064         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2065         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2066
2067         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2068         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2069         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2070
2071         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2072                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2073         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2074                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2075
2076         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2077                             &xmm_mask_lo, &xmm_mask_hi,
2078                             &xmm_src_lo, &xmm_src_hi);
2079         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2080                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2081                             &xmm_mask_lo, &xmm_mask_hi);
2082
2083         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2084
2085         pix_add_multiply_2x128 (
2086             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2087             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2088             &xmm_dst_lo, &xmm_dst_hi);
2089
2090         save_128_aligned (
2091             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2092
2093         ps += 4;
2094         pd += 4;
2095         pm += 4;
2096         w -= 4;
2097     }
2098
2099     while (w)
2100     {
2101         s = *ps++;
2102         m = *pm++;
2103         d = *pd;
2104
2105         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2106         w--;
2107     }
2108 }
2109
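/* Reverse ATOP swaps the roles of the two alphas, per channel:
 *
 *     dst.c = src.c * mask.c * (1 - dst.a) + dst.c * (mask.c * src.a)
 */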
2110 static force_inline uint32_t
2111 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2112                                          uint32_t mask,
2113                                          uint32_t dst)
2114 {
2115     __m128i m = unpack_32_1x128 (mask);
2116     __m128i s = unpack_32_1x128 (src);
2117     __m128i d = unpack_32_1x128 (dst);
2118
2119     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2120     __m128i sa = expand_alpha_1x128 (s);
2121
2122     s = pix_multiply_1x128 (s, m);
2123     m = pix_multiply_1x128 (m, sa);
2124
2125     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2126 }
2127
2128 static void
2129 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2130                               pixman_op_t              op,
2131                               uint32_t *               pd,
2132                               const uint32_t *         ps,
2133                               const uint32_t *         pm,
2134                               int                      w)
2135 {
2136     uint32_t s, m, d;
2137
2138     __m128i xmm_src_lo, xmm_src_hi;
2139     __m128i xmm_dst_lo, xmm_dst_hi;
2140     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2141     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2142     __m128i xmm_mask_lo, xmm_mask_hi;
2143
2144     while (w && (unsigned long)pd & 15)
2145     {
2146         s = *ps++;
2147         m = *pm++;
2148         d = *pd;
2149
2150         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2151         w--;
2152     }
2153
2154     while (w >= 4)
2155     {
2156         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2157         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2158         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2159
2160         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2161         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2162         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2163
2164         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2165                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2166         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2167                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2168
2169         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2170                             &xmm_mask_lo, &xmm_mask_hi,
2171                             &xmm_src_lo, &xmm_src_hi);
2172         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2173                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2174                             &xmm_mask_lo, &xmm_mask_hi);
2175
2176         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2177                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2178
2179         pix_add_multiply_2x128 (
2180             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2181             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2182             &xmm_dst_lo, &xmm_dst_hi);
2183
2184         save_128_aligned (
2185             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2186
2187         ps += 4;
2188         pd += 4;
2189         pm += 4;
2190         w -= 4;
2191     }
2192
2193     while (w)
2194     {
2195         s = *ps++;
2196         m = *pm++;
2197         d = *pd;
2198
2199         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2200         w--;
2201     }
2202 }
2203
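/* XOR keeps each operand only where the other is transparent, per
 * channel:
 *
 *     dst.c = src.c * mask.c * (1 - dst.a) + dst.c * (1 - mask.c * src.a)
 */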
2204 static force_inline uint32_t
2205 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2206                                 uint32_t mask,
2207                                 uint32_t dst)
2208 {
2209     __m128i a = unpack_32_1x128 (mask);
2210     __m128i s = unpack_32_1x128 (src);
2211     __m128i d = unpack_32_1x128 (dst);
2212
2213     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2214                                        a, expand_alpha_1x128 (s)));
2215     __m128i dest      = pix_multiply_1x128 (s, a);
2216     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2217
2218     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2219                                                 &alpha_dst,
2220                                                 &dest,
2221                                                 &alpha_src));
2222 }
2223
2224 static void
2225 sse2_combine_xor_ca (pixman_implementation_t *imp,
2226                      pixman_op_t              op,
2227                      uint32_t *               pd,
2228                      const uint32_t *         ps,
2229                      const uint32_t *         pm,
2230                      int                      w)
2231 {
2232     uint32_t s, m, d;
2233
2234     __m128i xmm_src_lo, xmm_src_hi;
2235     __m128i xmm_dst_lo, xmm_dst_hi;
2236     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2237     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2238     __m128i xmm_mask_lo, xmm_mask_hi;
2239
2240     while (w && (unsigned long)pd & 15)
2241     {
2242         s = *ps++;
2243         m = *pm++;
2244         d = *pd;
2245
2246         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2247         w--;
2248     }
2249
2250     while (w >= 4)
2251     {
2252         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2253         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2254         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2255
2256         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2257         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2258         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2259
2260         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2261                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2262         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2263                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2264
2265         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2266                             &xmm_mask_lo, &xmm_mask_hi,
2267                             &xmm_src_lo, &xmm_src_hi);
2268         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2269                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2270                             &xmm_mask_lo, &xmm_mask_hi);
2271
2272         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2273                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2274         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2275                       &xmm_mask_lo, &xmm_mask_hi);
2276
2277         pix_add_multiply_2x128 (
2278             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2279             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2280             &xmm_dst_lo, &xmm_dst_hi);
2281
2282         save_128_aligned (
2283             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2284
2285         ps += 4;
2286         pd += 4;
2287         pm += 4;
2288         w -= 4;
2289     }
2290
2291     while (w)
2292     {
2293         s = *ps++;
2294         m = *pm++;
2295         d = *pd;
2296
2297         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2298         w--;
2299     }
2300 }
2301
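/* ADD multiplies the source by the mask and adds it to the destination
 * with unsigned saturation, per channel:
 *
 *     dst.c = MIN (src.c * mask.c + dst.c, 255)
 *
 * _mm_adds_epu8 provides the clamp directly, so no pix_add_multiply
 * step is needed.
 */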
2302 static void
2303 sse2_combine_add_ca (pixman_implementation_t *imp,
2304                      pixman_op_t              op,
2305                      uint32_t *               pd,
2306                      const uint32_t *         ps,
2307                      const uint32_t *         pm,
2308                      int                      w)
2309 {
2310     uint32_t s, m, d;
2311
2312     __m128i xmm_src_lo, xmm_src_hi;
2313     __m128i xmm_dst_lo, xmm_dst_hi;
2314     __m128i xmm_mask_lo, xmm_mask_hi;
2315
2316     while (w && (unsigned long)pd & 15)
2317     {
2318         s = *ps++;
2319         m = *pm++;
2320         d = *pd;
2321
2322         *pd++ = pack_1x128_32 (
2323             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2324                                                unpack_32_1x128 (m)),
2325                            unpack_32_1x128 (d)));
2326         w--;
2327     }
2328
2329     while (w >= 4)
2330     {
2331         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2332         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2333         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2334
2335         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2336         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2337         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2338
2339         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2340                             &xmm_mask_lo, &xmm_mask_hi,
2341                             &xmm_src_lo, &xmm_src_hi);
2342
2343         save_128_aligned (
2344             (__m128i*)pd, pack_2x128_128 (
2345                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2346                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2347
2348         ps += 4;
2349         pd += 4;
2350         pm += 4;
2351         w -= 4;
2352     }
2353
2354     while (w)
2355     {
2356         s = *ps++;
2357         m = *pm++;
2358         d = *pd;
2359
2360         *pd++ = pack_1x128_32 (
2361             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2362                                                unpack_32_1x128 (m)),
2363                            unpack_32_1x128 (d)));
2364         w--;
2365     }
2366 }
2367
2368 /* ---------------------------------------------------
2369  * fb_compose_setup_sse2
2370  */
2371 static force_inline __m128i
2372 create_mask_16_128 (uint16_t mask)
2373 {
2374     return _mm_set1_epi16 (mask);
2375 }
2376
2377 /* Work around a code generation bug in Sun Studio 12. */
2378 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2379 # define create_mask_2x32_128(mask0, mask1)                             \
2380     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2381 #else
2382 static force_inline __m128i
2383 create_mask_2x32_128 (uint32_t mask0,
2384                       uint32_t mask1)
2385 {
2386     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2387 }
2388 #endif
2389
2390 /* -------------------------------------------------------------------
2391  * composite_over_n_8888
2392  */
2393
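/* With a solid source, both the source and its expanded alpha are loop
 * invariants, so each pixel reduces to a plain OVER:
 *
 *     dst = src + dst * (1 - src.a)
 *
 * computed channel-wise by over_1x128 / over_2x128.
 */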
2394 static void
2395 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2396                             pixman_op_t              op,
2397                             pixman_image_t *         src_image,
2398                             pixman_image_t *         mask_image,
2399                             pixman_image_t *         dst_image,
2400                             int32_t                  src_x,
2401                             int32_t                  src_y,
2402                             int32_t                  mask_x,
2403                             int32_t                  mask_y,
2404                             int32_t                  dest_x,
2405                             int32_t                  dest_y,
2406                             int32_t                  width,
2407                             int32_t                  height)
2408 {
2409     uint32_t src;
2410     uint32_t    *dst_line, *dst, d;
2411     int32_t w;
2412     int dst_stride;
2413     __m128i xmm_src, xmm_alpha;
2414     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2415
2416     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2417
2418     if (src == 0)
2419         return;
2420
2421     PIXMAN_IMAGE_GET_LINE (
2422         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2423
2424     xmm_src = expand_pixel_32_1x128 (src);
2425     xmm_alpha = expand_alpha_1x128 (xmm_src);
2426
2427     while (height--)
2428     {
2429         dst = dst_line;
2430
2431         dst_line += dst_stride;
2432         w = width;
2433
2434         while (w && (unsigned long)dst & 15)
2435         {
2436             d = *dst;
2437             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2438                                                 xmm_alpha,
2439                                                 unpack_32_1x128 (d)));
2440             w--;
2441         }
2442
2443         while (w >= 4)
2444         {
2445             xmm_dst = load_128_aligned ((__m128i*)dst);
2446
2447             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2448
2449             over_2x128 (&xmm_src, &xmm_src,
2450                         &xmm_alpha, &xmm_alpha,
2451                         &xmm_dst_lo, &xmm_dst_hi);
2452
2453             /* rebuild the 4 pixel data and save */
2454             save_128_aligned (
2455                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2456
2457             w -= 4;
2458             dst += 4;
2459         }
2460
2461         while (w)
2462         {
2463             d = *dst;
2464             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2465                                                 xmm_alpha,
2466                                                 unpack_32_1x128 (d)));
2467             w--;
2468         }
2469
2470     }
2471 }
2472
2473 /* ---------------------------------------------------------------------
2474  * composite_over_n_0565
2475  */
2476 static void
2477 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2478                             pixman_op_t              op,
2479                             pixman_image_t *         src_image,
2480                             pixman_image_t *         mask_image,
2481                             pixman_image_t *         dst_image,
2482                             int32_t                  src_x,
2483                             int32_t                  src_y,
2484                             int32_t                  mask_x,
2485                             int32_t                  mask_y,
2486                             int32_t                  dest_x,
2487                             int32_t                  dest_y,
2488                             int32_t                  width,
2489                             int32_t                  height)
2490 {
2491     uint32_t src;
2492     uint16_t    *dst_line, *dst, d;
2493     int32_t w;
2494     int dst_stride;
2495     __m128i xmm_src, xmm_alpha;
2496     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2497
2498     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2499
2500     if (src == 0)
2501         return;
2502
2503     PIXMAN_IMAGE_GET_LINE (
2504         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2505
2506     xmm_src = expand_pixel_32_1x128 (src);
2507     xmm_alpha = expand_alpha_1x128 (xmm_src);
2508
2509     while (height--)
2510     {
2511         dst = dst_line;
2512
2513         dst_line += dst_stride;
2514         w = width;
2515
2516         while (w && (unsigned long)dst & 15)
2517         {
2518             d = *dst;
2519
2520             *dst++ = pack_565_32_16 (
2521                 pack_1x128_32 (over_1x128 (xmm_src,
2522                                            xmm_alpha,
2523                                            expand565_16_1x128 (d))));
2524             w--;
2525         }
2526
2527         while (w >= 8)
2528         {
2529             xmm_dst = load_128_aligned ((__m128i*)dst);
2530
2531             unpack_565_128_4x128 (xmm_dst,
2532                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2533
2534             over_2x128 (&xmm_src, &xmm_src,
2535                         &xmm_alpha, &xmm_alpha,
2536                         &xmm_dst0, &xmm_dst1);
2537             over_2x128 (&xmm_src, &xmm_src,
2538                         &xmm_alpha, &xmm_alpha,
2539                         &xmm_dst2, &xmm_dst3);
2540
2541             xmm_dst = pack_565_4x128_128 (
2542                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2543
2544             save_128_aligned ((__m128i*)dst, xmm_dst);
2545
2546             dst += 8;
2547             w -= 8;
2548         }
2549
2550         while (w--)
2551         {
2552             d = *dst;
2553             *dst++ = pack_565_32_16 (
2554                 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2555                                            expand565_16_1x128 (d))));
2556         }
2557     }
2558
2559 }
2560
2561 /* ------------------------------
2562  * composite_add_n_8888_8888_ca
2563  */
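
/* Solid source with a component-alpha a8r8g8b8 mask; each pixel is
 *
 *     dst.c = MIN (mask.c * src.c + dst.c, 255)
 *
 * The vector loop compares four mask pixels against zero first and,
 * when _mm_movemask_epi8 reports all lanes zero, skips the block
 * entirely, a cheap early-out for fully transparent spans.
 */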
2564 static void
2565 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2566                                    pixman_op_t              op,
2567                                    pixman_image_t *         src_image,
2568                                    pixman_image_t *         mask_image,
2569                                    pixman_image_t *         dst_image,
2570                                    int32_t                  src_x,
2571                                    int32_t                  src_y,
2572                                    int32_t                  mask_x,
2573                                    int32_t                  mask_y,
2574                                    int32_t                  dest_x,
2575                                    int32_t                  dest_y,
2576                                    int32_t                  width,
2577                                    int32_t                  height)
2578 {
2579     uint32_t src;
2580     uint32_t    *dst_line, d;
2581     uint32_t    *mask_line, m;
2582     uint32_t pack_cmp;
2583     int dst_stride, mask_stride;
2584
2585     __m128i xmm_src, xmm_alpha;
2586     __m128i xmm_dst;
2587     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2588
2589     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2590
2591     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2593
2594     if (src == 0)
2595         return;
2596
2597     PIXMAN_IMAGE_GET_LINE (
2598         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2599     PIXMAN_IMAGE_GET_LINE (
2600         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2601
2602     xmm_src = _mm_unpacklo_epi8 (
2603         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2604     xmm_alpha = expand_alpha_1x128 (xmm_src);
2605     mmx_src   = xmm_src;
2606     mmx_alpha = xmm_alpha;
2607
2608     while (height--)
2609     {
2610         int w = width;
2611         const uint32_t *pm = (uint32_t *)mask_line;
2612         uint32_t *pd = (uint32_t *)dst_line;
2613
2614         dst_line += dst_stride;
2615         mask_line += mask_stride;
2616
2617         while (w && (unsigned long)pd & 15)
2618         {
2619             m = *pm++;
2620
2621             if (m)
2622             {
2623                 d = *pd;
2624
2625                 mmx_mask = unpack_32_1x128 (m);
2626                 mmx_dest = unpack_32_1x128 (d);
2627
2628                 *pd = pack_1x128_32 (
2629                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2630             }
2631
2632             pd++;
2633             w--;
2634         }
2635
2636         while (w >= 4)
2637         {
2638             xmm_mask = load_128_unaligned ((__m128i*)pm);
2639
2640             pack_cmp =
2641                 _mm_movemask_epi8 (
2642                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2643
2644         /* if all bits of the mask are zero, pack_cmp is 0xffff */
2645             if (pack_cmp != 0xffff)
2646             {
2647                 xmm_dst = load_128_aligned ((__m128i*)pd);
2648
2649                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2650
2651                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2652                                     &xmm_mask_lo, &xmm_mask_hi,
2653                                     &xmm_mask_lo, &xmm_mask_hi);
2654                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2655
2656                 save_128_aligned (
2657                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2658             }
2659
2660             pd += 4;
2661             pm += 4;
2662             w -= 4;
2663         }
2664
2665         while (w)
2666         {
2667             m = *pm++;
2668
2669             if (m)
2670             {
2671                 d = *pd;
2672
2673                 mmx_mask = unpack_32_1x128 (m);
2674                 mmx_dest = unpack_32_1x128 (d);
2675
2676                 *pd = pack_1x128_32 (
2677                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2678             }
2679
2680             pd++;
2681             w--;
2682         }
2683     }
2684
2685 }
2686
2687 /* ---------------------------------------------------------------------------
2688  * composite_over_n_8888_8888_ca
2689  */
2690
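/* The OVER counterpart of the function above; each pixel is
 *
 *     dst = src * mask + dst * (1 - mask * src.a)
 *
 * evaluated channel-wise by in_over_1x128 / in_over_2x128, with the
 * same movemask early-out for fully transparent mask blocks.
 */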
2691 static void
2692 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2693                                     pixman_op_t              op,
2694                                     pixman_image_t *         src_image,
2695                                     pixman_image_t *         mask_image,
2696                                     pixman_image_t *         dst_image,
2697                                     int32_t                  src_x,
2698                                     int32_t                  src_y,
2699                                     int32_t                  mask_x,
2700                                     int32_t                  mask_y,
2701                                     int32_t                  dest_x,
2702                                     int32_t                  dest_y,
2703                                     int32_t                  width,
2704                                     int32_t                  height)
2705 {
2706     uint32_t src;
2707     uint32_t    *dst_line, d;
2708     uint32_t    *mask_line, m;
2709     uint32_t pack_cmp;
2710     int dst_stride, mask_stride;
2711
2712     __m128i xmm_src, xmm_alpha;
2713     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2714     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2715
2716     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2717
2718     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2719
2720     if (src == 0)
2721         return;
2722
2723     PIXMAN_IMAGE_GET_LINE (
2724         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2725     PIXMAN_IMAGE_GET_LINE (
2726         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2727
2728     xmm_src = _mm_unpacklo_epi8 (
2729         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2730     xmm_alpha = expand_alpha_1x128 (xmm_src);
2731     mmx_src   = xmm_src;
2732     mmx_alpha = xmm_alpha;
2733
2734     while (height--)
2735     {
2736         int w = width;
2737         const uint32_t *pm = (uint32_t *)mask_line;
2738         uint32_t *pd = (uint32_t *)dst_line;
2739
2740         dst_line += dst_stride;
2741         mask_line += mask_stride;
2742
2743         while (w && (unsigned long)pd & 15)
2744         {
2745             m = *pm++;
2746
2747             if (m)
2748             {
2749                 d = *pd;
2750                 mmx_mask = unpack_32_1x128 (m);
2751                 mmx_dest = unpack_32_1x128 (d);
2752
2753                 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2754                                                   &mmx_alpha,
2755                                                   &mmx_mask,
2756                                                   &mmx_dest));
2757             }
2758
2759             pd++;
2760             w--;
2761         }
2762
2763         while (w >= 4)
2764         {
2765             xmm_mask = load_128_unaligned ((__m128i*)pm);
2766
2767             pack_cmp =
2768                 _mm_movemask_epi8 (
2769                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2770
2771         /* if all bits of the mask are zero, pack_cmp is 0xffff */
2772             if (pack_cmp != 0xffff)
2773             {
2774                 xmm_dst = load_128_aligned ((__m128i*)pd);
2775
2776                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2777                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2778
2779                 in_over_2x128 (&xmm_src, &xmm_src,
2780                                &xmm_alpha, &xmm_alpha,
2781                                &xmm_mask_lo, &xmm_mask_hi,
2782                                &xmm_dst_lo, &xmm_dst_hi);
2783
2784                 save_128_aligned (
2785                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2786             }
2787
2788             pd += 4;
2789             pm += 4;
2790             w -= 4;
2791         }
2792
2793         while (w)
2794         {
2795             m = *pm++;
2796
2797             if (m)
2798             {
2799                 d = *pd;
2800                 mmx_mask = unpack_32_1x128 (m);
2801                 mmx_dest = unpack_32_1x128 (d);
2802
2803                 *pd = pack_1x128_32 (
2804                     in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2805             }
2806
2807             pd++;
2808             w--;
2809         }
2810     }
2811
2812 }
2813
2814 /*---------------------------------------------------------------------
2815  * composite_over_8888_n_8888
2816  */
2817
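/* Here the mask is a solid color, of which only the alpha byte is
 * used: create_mask_16_128 (mask >> 24) broadcasts it across all
 * channel lanes.  Each pixel is
 *
 *     dst = src * m + dst * (1 - src.a * m)
 *
 * and is_zero () skips blocks of four fully transparent source pixels.
 */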
2818 static void
2819 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2820                                  pixman_op_t              op,
2821                                  pixman_image_t *         src_image,
2822                                  pixman_image_t *         mask_image,
2823                                  pixman_image_t *         dst_image,
2824                                  int32_t                  src_x,
2825                                  int32_t                  src_y,
2826                                  int32_t                  mask_x,
2827                                  int32_t                  mask_y,
2828                                  int32_t                  dest_x,
2829                                  int32_t                  dest_y,
2830                                  int32_t                  width,
2831                                  int32_t                  height)
2832 {
2833     uint32_t    *dst_line, *dst;
2834     uint32_t    *src_line, *src;
2835     uint32_t mask;
2836     int32_t w;
2837     int dst_stride, src_stride;
2838
2839     __m128i xmm_mask;
2840     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2841     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2842     __m128i xmm_alpha_lo, xmm_alpha_hi;
2843
2844     PIXMAN_IMAGE_GET_LINE (
2845         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2846     PIXMAN_IMAGE_GET_LINE (
2847         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2848
2849     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2850
2851     xmm_mask = create_mask_16_128 (mask >> 24);
2852
2853     while (height--)
2854     {
2855         dst = dst_line;
2856         dst_line += dst_stride;
2857         src = src_line;
2858         src_line += src_stride;
2859         w = width;
2860
2861         while (w && (unsigned long)dst & 15)
2862         {
2863             uint32_t s = *src++;
2864
2865             if (s)
2866             {
2867                 uint32_t d = *dst;
2868                 
2869                 __m128i ms = unpack_32_1x128 (s);
2870                 __m128i alpha    = expand_alpha_1x128 (ms);
2871                 __m128i mask  = xmm_mask;
2872                 __m128i dest  = unpack_32_1x128 (d);
2873
2874                 *dst = pack_1x128_32 (
2875                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2876             }
2877             dst++;
2878             w--;
2879         }
2880
2881         while (w >= 4)
2882         {
2883             xmm_src = load_128_unaligned ((__m128i*)src);
2884
2885             if (!is_zero (xmm_src))
2886             {
2887                 xmm_dst = load_128_aligned ((__m128i*)dst);
2888                 
2889                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2890                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2891                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2892                                     &xmm_alpha_lo, &xmm_alpha_hi);
2893                 
2894                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2895                                &xmm_alpha_lo, &xmm_alpha_hi,
2896                                &xmm_mask, &xmm_mask,
2897                                &xmm_dst_lo, &xmm_dst_hi);
2898                 
2899                 save_128_aligned (
2900                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2901             }
2902                 
2903             dst += 4;
2904             src += 4;
2905             w -= 4;
2906         }
2907
2908         while (w)
2909         {
2910             uint32_t s = *src++;
2911
2912             if (s)
2913             {
2914                 uint32_t d = *dst;
2915                 
2916                 __m128i ms = unpack_32_1x128 (s);
2917                 __m128i alpha = expand_alpha_1x128 (ms);
2918                 __m128i mask  = xmm_mask;
2919                 __m128i dest  = unpack_32_1x128 (d);
2920                 
2921                 *dst = pack_1x128_32 (
2922                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2923             }
2924
2925             dst++;
2926             w--;
2927         }
2928     }
2929
2930 }
2931
2932 /*---------------------------------------------------------------------
2933  * composite_src_x888_8888
2934  */
2935
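/* A pure copy that forces the alpha byte:  *dst = *src | 0xff000000.
 * The vector body ORs mask_ff000000 into 16 pixels per iteration.
 */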
2936 static void
2937 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2938                               pixman_op_t              op,
2939                               pixman_image_t *         src_image,
2940                               pixman_image_t *         mask_image,
2941                               pixman_image_t *         dst_image,
2942                               int32_t                  src_x,
2943                               int32_t                  src_y,
2944                               int32_t                  mask_x,
2945                               int32_t                  mask_y,
2946                               int32_t                  dest_x,
2947                               int32_t                  dest_y,
2948                               int32_t                  width,
2949                               int32_t                  height)
2950 {
2951     uint32_t    *dst_line, *dst;
2952     uint32_t    *src_line, *src;
2953     int32_t w;
2954     int dst_stride, src_stride;
2955
2957     PIXMAN_IMAGE_GET_LINE (
2958         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2959     PIXMAN_IMAGE_GET_LINE (
2960         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2961
2962     while (height--)
2963     {
2964         dst = dst_line;
2965         dst_line += dst_stride;
2966         src = src_line;
2967         src_line += src_stride;
2968         w = width;
2969
2970         while (w && (unsigned long)dst & 15)
2971         {
2972             *dst++ = *src++ | 0xff000000;
2973             w--;
2974         }
2975
2976         while (w >= 16)
2977         {
2978             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2979             
2980             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2981             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2982             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2983             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2984             
2985             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2986             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2987             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2988             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2989             
2990             dst += 16;
2991             src += 16;
2992             w -= 16;
2993         }
2994
2995         while (w)
2996         {
2997             *dst++ = *src++ | 0xff000000;
2998             w--;
2999         }
3000     }
3001
3002 }
3003
3004 /* ---------------------------------------------------------------------
3005  * composite_over_x888_n_8888
3006  */
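
/* The source is x888, i.e. effectively opaque, so with xmm_alpha set
 * to the constant 1.0 vector (mask_00ff) the in_over reduces to a
 * linear blend by the mask alpha m:
 *
 *     dst = src * m + dst * (1 - m)
 */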
3007 static void
3008 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3009                                  pixman_op_t              op,
3010                                  pixman_image_t *         src_image,
3011                                  pixman_image_t *         mask_image,
3012                                  pixman_image_t *         dst_image,
3013                                  int32_t                  src_x,
3014                                  int32_t                  src_y,
3015                                  int32_t                  mask_x,
3016                                  int32_t                  mask_y,
3017                                  int32_t                  dest_x,
3018                                  int32_t                  dest_y,
3019                                  int32_t                  width,
3020                                  int32_t                  height)
3021 {
3022     uint32_t    *dst_line, *dst;
3023     uint32_t    *src_line, *src;
3024     uint32_t mask;
3025     int dst_stride, src_stride;
3026     int32_t w;
3027
3028     __m128i xmm_mask, xmm_alpha;
3029     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3030     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3031
3032     PIXMAN_IMAGE_GET_LINE (
3033         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3034     PIXMAN_IMAGE_GET_LINE (
3035         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3036
3037     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3038
3039     xmm_mask = create_mask_16_128 (mask >> 24);
3040     xmm_alpha = mask_00ff;
3041
3042     while (height--)
3043     {
3044         dst = dst_line;
3045         dst_line += dst_stride;
3046         src = src_line;
3047         src_line += src_stride;
3048         w = width;
3049
3050         while (w && (unsigned long)dst & 15)
3051         {
3052             uint32_t s = (*src++) | 0xff000000;
3053             uint32_t d = *dst;
3054
3055             __m128i src   = unpack_32_1x128 (s);
3056             __m128i alpha = xmm_alpha;
3057             __m128i mask  = xmm_mask;
3058             __m128i dest  = unpack_32_1x128 (d);
3059
3060             *dst++ = pack_1x128_32 (
3061                 in_over_1x128 (&src, &alpha, &mask, &dest));
3062
3063             w--;
3064         }
3065
3066         while (w >= 4)
3067         {
3068             xmm_src = _mm_or_si128 (
3069                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3070             xmm_dst = load_128_aligned ((__m128i*)dst);
3071
3072             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3073             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3074
3075             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3076                            &xmm_alpha, &xmm_alpha,
3077                            &xmm_mask, &xmm_mask,
3078                            &xmm_dst_lo, &xmm_dst_hi);
3079
3080             save_128_aligned (
3081                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3082
3083             dst += 4;
3084             src += 4;
3085             w -= 4;
3087         }
3088
3089         while (w)
3090         {
3091             uint32_t s = (*src++) | 0xff000000;
3092             uint32_t d = *dst;
3093
3094             __m128i src  = unpack_32_1x128 (s);
3095             __m128i alpha = xmm_alpha;
3096             __m128i mask  = xmm_mask;
3097             __m128i dest  = unpack_32_1x128 (d);
3098
3099             *dst++ = pack_1x128_32 (
3100                 in_over_1x128 (&src, &alpha, &mask, &dest));
3101
3102             w--;
3103         }
3104     }
3105
3106 }
3107
3108 /* --------------------------------------------------------------------
3109  * composite_over_8888_8888
3110  */
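
/* No mask and no solid color to exploit here, so each scanline is
 * handed directly to sse2_combine_over_u.
 */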
3111 static void
3112 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3113                                pixman_op_t              op,
3114                                pixman_image_t *         src_image,
3115                                pixman_image_t *         mask_image,
3116                                pixman_image_t *         dst_image,
3117                                int32_t                  src_x,
3118                                int32_t                  src_y,
3119                                int32_t                  mask_x,
3120                                int32_t                  mask_y,
3121                                int32_t                  dest_x,
3122                                int32_t                  dest_y,
3123                                int32_t                  width,
3124                                int32_t                  height)
3125 {
3126     int dst_stride, src_stride;
3127     uint32_t    *dst_line, *dst;
3128     uint32_t    *src_line, *src;
3129
3130     PIXMAN_IMAGE_GET_LINE (
3131         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3132     PIXMAN_IMAGE_GET_LINE (
3133         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3134
3135     dst = dst_line;
3136     src = src_line;
3137
3138     while (height--)
3139     {
3140         sse2_combine_over_u (imp, op, dst, src, NULL, width);
3141
3142         dst += dst_stride;
3143         src += src_stride;
3144     }
3145 }
3146
3147 /* ------------------------------------------------------------------
3148  * composite_over_8888_0565
3149  */
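
/* r5g6b5 destinations are expanded to a8r8g8b8, composited with OVER,
 * and repacked.  The vector loop works on 8 pixels at a time because a
 * 128-bit register holds eight 565 pixels but only four 8888 ones.
 */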
3150 static force_inline uint16_t
3151 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3152 {
3153     __m128i ms;
3154
3155     ms = unpack_32_1x128 (src);
3156     return pack_565_32_16 (
3157         pack_1x128_32 (
3158             over_1x128 (
3159                 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3160 }
3161
3162 static void
3163 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3164                                pixman_op_t              op,
3165                                pixman_image_t *         src_image,
3166                                pixman_image_t *         mask_image,
3167                                pixman_image_t *         dst_image,
3168                                int32_t                  src_x,
3169                                int32_t                  src_y,
3170                                int32_t                  mask_x,
3171                                int32_t                  mask_y,
3172                                int32_t                  dest_x,
3173                                int32_t                  dest_y,
3174                                int32_t                  width,
3175                                int32_t                  height)
3176 {
3177     uint16_t    *dst_line, *dst, d;
3178     uint32_t    *src_line, *src, s;
3179     int dst_stride, src_stride;
3180     int32_t w;
3181
3182     __m128i xmm_alpha_lo, xmm_alpha_hi;
3183     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3184     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3185
3186     PIXMAN_IMAGE_GET_LINE (
3187         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3188     PIXMAN_IMAGE_GET_LINE (
3189         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3190
3191 #if 0
3192     /* FIXME
3193      *
3194      * This code was copied from the MMX version, FIXME included.
3195      * If it's a problem there, it is probably a problem here too.
3196      */
3197     assert (src_image->drawable == mask_image->drawable);
3198 #endif
3199
3200     while (height--)
3201     {
3202         dst = dst_line;
3203         src = src_line;
3204
3205         dst_line += dst_stride;
3206         src_line += src_stride;
3207         w = width;
3208
3209         /* Align dst on a 16-byte boundary */
3210         while (w &&
3211                ((unsigned long)dst & 15))
3212         {
3213             s = *src++;
3214             d = *dst;
3215
3216             *dst++ = composite_over_8888_0565pixel (s, d);
3217             w--;
3218         }
3219
3220         /* It's an 8-pixel loop */
3221         while (w >= 8)
3222         {
3223             /* Load unaligned: the source address is not
3224              * guaranteed to be 16-byte aligned.
3225              */
3226             xmm_src = load_128_unaligned ((__m128i*) src);
3227             xmm_dst = load_128_aligned ((__m128i*) dst);
3228
3229             /* Unpacking */
3230             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3231             unpack_565_128_4x128 (xmm_dst,
3232                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3233             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3234                                 &xmm_alpha_lo, &xmm_alpha_hi);
3235
3236             /* Load the next 4 pixels from memory early,
3237              * so the read overlaps with the blending below.
3238              */
3239             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3240
3241             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3242                         &xmm_alpha_lo, &xmm_alpha_hi,
3243                         &xmm_dst0, &xmm_dst1);
3244
3245             /* Unpacking */
3246             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3247             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3248                                 &xmm_alpha_lo, &xmm_alpha_hi);
3249
3250             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3251                         &xmm_alpha_lo, &xmm_alpha_hi,
3252                         &xmm_dst2, &xmm_dst3);
3253
3254             save_128_aligned (
3255                 (__m128i*)dst, pack_565_4x128_128 (
3256                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3257
3258             w -= 8;
3259             dst += 8;
3260             src += 8;
3261         }
3262
3263         while (w--)
3264         {
3265             s = *src++;
3266             d = *dst;
3267
3268             *dst++ = composite_over_8888_0565pixel (s, d);
3269         }
3270     }
3271
3272 }
3273
3274 /* -----------------------------------------------------------------
3275  * composite_over_n_8_8888
3276  */
3277
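/* Solid source with an a8 mask.  The vector loop reads four mask bytes
 * at once; a block of 0xffffffff with an opaque source stores the
 * precomputed solid value xmm_def directly, and a block of zero is
 * skipped entirely.
 */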
3278 static void
3279 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3280                               pixman_op_t              op,
3281                               pixman_image_t *         src_image,
3282                               pixman_image_t *         mask_image,
3283                               pixman_image_t *         dst_image,
3284                               int32_t                  src_x,
3285                               int32_t                  src_y,
3286                               int32_t                  mask_x,
3287                               int32_t                  mask_y,
3288                               int32_t                  dest_x,
3289                               int32_t                  dest_y,
3290                               int32_t                  width,
3291                               int32_t                  height)
3292 {
3293     uint32_t src, srca;
3294     uint32_t *dst_line, *dst;
3295     uint8_t *mask_line, *mask;
3296     int dst_stride, mask_stride;
3297     int32_t w;
3298     uint32_t m, d;
3299
3300     __m128i xmm_src, xmm_alpha, xmm_def;
3301     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3302     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3303
3304     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3305
3306     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3307
3308     srca = src >> 24;
3309     if (src == 0)
3310         return;
3311
3312     PIXMAN_IMAGE_GET_LINE (
3313         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3314     PIXMAN_IMAGE_GET_LINE (
3315         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3316
3317     xmm_def = create_mask_2x32_128 (src, src);
3318     xmm_src = expand_pixel_32_1x128 (src);
3319     xmm_alpha = expand_alpha_1x128 (xmm_src);
3320     mmx_src   = xmm_src;
3321     mmx_alpha = xmm_alpha;
3322
3323     while (height--)
3324     {
3325         dst = dst_line;
3326         dst_line += dst_stride;
3327         mask = mask_line;
3328         mask_line += mask_stride;
3329         w = width;
3330
3331         while (w && (unsigned long)dst & 15)
3332         {
3333             uint8_t m = *mask++;
3334
3335             if (m)
3336             {
3337                 d = *dst;
3338                 mmx_mask = expand_pixel_8_1x128 (m);
3339                 mmx_dest = unpack_32_1x128 (d);
3340
3341                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3342                                                    &mmx_alpha,
3343                                                    &mmx_mask,
3344                                                    &mmx_dest));
3345             }
3346
3347             w--;
3348             dst++;
3349         }
3350
3351         while (w >= 4)
3352         {
3353             m = *((uint32_t*)mask);
3354
3355             if (srca == 0xff && m == 0xffffffff)
3356             {
3357                 save_128_aligned ((__m128i*)dst, xmm_def);
3358             }
3359             else if (m)
3360             {
3361                 xmm_dst = load_128_aligned ((__m128i*) dst);
3362                 xmm_mask = unpack_32_1x128 (m);
3363                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3364
3365                 /* Unpacking */
3366                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3367                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3368
3369                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3370                                         &xmm_mask_lo, &xmm_mask_hi);
3371
3372                 in_over_2x128 (&xmm_src, &xmm_src,
3373                                &xmm_alpha, &xmm_alpha,
3374                                &xmm_mask_lo, &xmm_mask_hi,
3375                                &xmm_dst_lo, &xmm_dst_hi);
3376
3377                 save_128_aligned (
3378                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3379             }
3380
3381             w -= 4;
3382             dst += 4;
3383             mask += 4;
3384         }
3385
3386         while (w)
3387         {
3388             uint8_t m = *mask++;
3389
3390             if (m)
3391             {
3392                 d = *dst;
3393                 mmx_mask = expand_pixel_8_1x128 (m);
3394                 mmx_dest = unpack_32_1x128 (d);
3395
3396                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3397                                                    &mmx_alpha,
3398                                                    &mmx_mask,
3399                                                    &mmx_dest));
3400             }
3401
3402             w--;
3403             dst++;
3404         }
3405     }
3406
3407 }
3408
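/* A scalar sketch of what in_over computes per channel may help when
 * reading the loops above (illustrative only; these helpers are not
 * part of pixman's API).  mul_div_255 is the usual rounded byte
 * multiply, approximately (a * b) / 255:
 *
 *     static uint8_t mul_div_255 (uint8_t a, uint8_t b)
 *     {
 *         uint32_t t = a * b + 0x80;
 *         return (uint8_t) ((t + (t >> 8)) >> 8);
 *     }
 *
 *     static uint8_t in_over_channel (uint8_t s, uint8_t sa,
 *                                     uint8_t m, uint8_t d)
 *     {
 *         uint8_t sm  = mul_div_255 (s, m);
 *         uint8_t sam = mul_div_255 (sa, m);
 *         return sm + mul_div_255 (d, 255 - sam);
 *     }
 *
 * The _1x128 and _2x128 variants evaluate the same expression on
 * 16-bit channels packed into 128-bit registers.
 */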
3409 /* ----------------------------------------------------------------
3410  * pixman_fill_sse2
3411  */
3412
3413 pixman_bool_t
3414 pixman_fill_sse2 (uint32_t *bits,
3415                   int       stride,
3416                   int       bpp,
3417                   int       x,
3418                   int       y,
3419                   int       width,
3420                   int       height,
3421                   uint32_t  data)
3422 {
3423     uint32_t byte_width;
3424     uint8_t         *byte_line;
3425
3426     __m128i xmm_def;
3427
3428     if (bpp == 8)
3429     {
3430         uint8_t b;
3431         uint16_t w;
3432
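        /* stride is in uint32_t units; convert it to byte units.  The
         * "/ 1" and "*= 1" are no-ops, kept for symmetry with the
         * 16 bpp and 32 bpp cases below. */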
3433         stride = stride * (int) sizeof (uint32_t) / 1;
3434         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3435         byte_width = width;
3436         stride *= 1;
3437
3438         b = data & 0xff;
3439         w = (b << 8) | b;
3440         data = (w << 16) | w;
3441     }
3442     else if (bpp == 16)
3443     {
3444         stride = stride * (int) sizeof (uint32_t) / 2;
3445         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3446         byte_width = 2 * width;
3447         stride *= 2;
3448
3449         data = (data & 0xffff) * 0x00010001;
3450     }
3451     else if (bpp == 32)
3452     {
3453         stride = stride * (int) sizeof (uint32_t) / 4;
3454         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3455         byte_width = 4 * width;
3456         stride *= 4;
3457     }
3458     else
3459     {
3460         return FALSE;
3461     }
3462
3463     xmm_def = create_mask_2x32_128 (data, data);
3464
3465     while (height--)
3466     {
3467         int w;
3468         uint8_t *d = byte_line;
3469         byte_line += stride;
3470         w = byte_width;
3471
3472         while (w >= 1 && ((unsigned long)d & 1))
3473         {
3474             *(uint8_t *)d = data;
3475             w -= 1;
3476             d += 1;
3477         }
3478
3479         while (w >= 2 && ((unsigned long)d & 3))
3480         {
3481             *(uint16_t *)d = data;
3482             w -= 2;
3483             d += 2;
3484         }
3485
3486         while (w >= 4 && ((unsigned long)d & 15))
3487         {
3488             *(uint32_t *)d = data;
3489
3490             w -= 4;
3491             d += 4;
3492         }
3493
3494         while (w >= 128)
3495         {
3496             save_128_aligned ((__m128i*)(d),     xmm_def);
3497             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3498             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3499             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3500             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3501             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3502             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3503             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3504
3505             d += 128;
3506             w -= 128;
3507         }
3508
3509         if (w >= 64)
3510         {
3511             save_128_aligned ((__m128i*)(d),     xmm_def);
3512             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3513             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3514             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3515
3516             d += 64;
3517             w -= 64;
3518         }
3519
3520         if (w >= 32)
3521         {
3522             save_128_aligned ((__m128i*)(d),     xmm_def);
3523             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3524
3525             d += 32;
3526             w -= 32;
3527         }
3528
3529         if (w >= 16)
3530         {
3531             save_128_aligned ((__m128i*)(d),     xmm_def);
3532
3533             d += 16;
3534             w -= 16;
3535         }
3536
3537         while (w >= 4)
3538         {
3539             *(uint32_t *)d = data;
3540
3541             w -= 4;
3542             d += 4;
3543         }
3544
3545         if (w >= 2)
3546         {
3547             *(uint16_t *)d = data;
3548             w -= 2;
3549             d += 2;
3550         }
3551
3552         if (w >= 1)
3553         {
3554             *(uint8_t *)d = data;
3555             w -= 1;
3556             d += 1;
3557         }
3558     }
3559
3560     return TRUE;
3561 }
3562
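/* Usage sketch for pixman_fill_sse2 (hypothetical buffer; strides are
 * in uint32_t units, as everywhere in pixman):
 *
 *     uint32_t buf[64 * 64];
 *
 *     pixman_fill_sse2 (buf, 64, 32, 10, 20, 30, 15, 0xffff0000);
 *
 * fills the 30x15 rectangle at (10, 20) of a 64-pixel-wide 32 bpp
 * buffer with opaque red.  For 8 and 16 bpp the fill value is first
 * replicated across all 32 bits of `data', so each aligned 128-bit
 * store writes 16, 8 or 4 pixels respectively.
 */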
3563 static void
3564 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3565                              pixman_op_t              op,
3566                              pixman_image_t *         src_image,
3567                              pixman_image_t *         mask_image,
3568                              pixman_image_t *         dst_image,
3569                              int32_t                  src_x,
3570                              int32_t                  src_y,
3571                              int32_t                  mask_x,
3572                              int32_t                  mask_y,
3573                              int32_t                  dest_x,
3574                              int32_t                  dest_y,
3575                              int32_t                  width,
3576                              int32_t                  height)
3577 {
3578     uint32_t src, srca;
3579     uint32_t    *dst_line, *dst;
3580     uint8_t     *mask_line, *mask;
3581     int dst_stride, mask_stride;
3582     int32_t w;
3583     uint32_t m;
3584
3585     __m128i xmm_src, xmm_def;
3586     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3587
3588     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3589
3590     srca = src >> 24;
3591     if (src == 0)
3592     {
3593         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3594                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
3595                           dest_x, dest_y, width, height, 0);
3596         return;
3597     }
3598
3599     PIXMAN_IMAGE_GET_LINE (
3600         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3601     PIXMAN_IMAGE_GET_LINE (
3602         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3603
3604     xmm_def = create_mask_2x32_128 (src, src);
3605     xmm_src = expand_pixel_32_1x128 (src);
3606
3607     while (height--)
3608     {
3609         dst = dst_line;
3610         dst_line += dst_stride;
3611         mask = mask_line;
3612         mask_line += mask_stride;
3613         w = width;
3614
3615         while (w && (unsigned long)dst & 15)
3616         {
3617             uint8_t m = *mask++;
3618
3619             if (m)
3620             {
3621                 *dst = pack_1x128_32 (
3622                     pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3623             }
3624             else
3625             {
3626                 *dst = 0;
3627             }
3628
3629             w--;
3630             dst++;
3631         }
3632
3633         while (w >= 4)
3634         {
3635             m = *((uint32_t*)mask);
3636
3637             if (srca == 0xff && m == 0xffffffff)
3638             {
3639                 save_128_aligned ((__m128i*)dst, xmm_def);
3640             }
3641             else if (m)
3642             {
3643                 xmm_mask = unpack_32_1x128 (m);
3644                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3645
3646                 /* Unpacking */
3647                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3648
3649                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3650                                         &xmm_mask_lo, &xmm_mask_hi);
3651
3652                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3653                                     &xmm_mask_lo, &xmm_mask_hi,
3654                                     &xmm_mask_lo, &xmm_mask_hi);
3655
3656                 save_128_aligned (
3657                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3658             }
3659             else
3660             {
3661                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3662             }
3663
3664             w -= 4;
3665             dst += 4;
3666             mask += 4;
3667         }
3668
3669         while (w)
3670         {
3671             uint8_t m = *mask++;
3672
3673             if (m)
3674             {
3675                 *dst = pack_1x128_32 (
3676                     pix_multiply_1x128 (
3677                         xmm_src, expand_pixel_8_1x128 (m)));
3678             }
3679             else
3680             {
3681                 *dst = 0;
3682             }
3683
3684             w--;
3685             dst++;
3686         }
3687     }
3688
3689 }
3690
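/* Unlike the OVER variant above, SRC replaces every destination pixel,
 * so a zero mask byte must store zero rather than leave the pixel
 * alone.  Per channel the general case is a single rounded multiply
 * (scalar sketch, using mul_div_255 as sketched earlier):
 *
 *     dst = mul_div_255 (src, m);
 *
 * which is why the vector loop never needs to load the destination:
 * it stores xmm_def untouched when srca is 0xff and the mask word is
 * 0xffffffff, zero when the mask word is zero, and src * mask
 * otherwise.
 */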
3691 /*-----------------------------------------------------------------------
3692  * composite_over_n_8_0565
3693  */
3694
3695 static void
3696 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3697                               pixman_op_t              op,
3698                               pixman_image_t *         src_image,
3699                               pixman_image_t *         mask_image,
3700                               pixman_image_t *         dst_image,
3701                               int32_t                  src_x,
3702                               int32_t                  src_y,
3703                               int32_t                  mask_x,
3704                               int32_t                  mask_y,
3705                               int32_t                  dest_x,
3706                               int32_t                  dest_y,
3707                               int32_t                  width,
3708                               int32_t                  height)
3709 {
3710     uint32_t src, srca;
3711     uint16_t    *dst_line, *dst, d;
3712     uint8_t     *mask_line, *mask;
3713     int dst_stride, mask_stride;
3714     int32_t w;
3715     uint32_t m;
3716     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3717
3718     __m128i xmm_src, xmm_alpha;
3719     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3720     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3721
3722     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3723
3724     srca = src >> 24;
3725     if (src == 0)
3726         return;
3727
3728     PIXMAN_IMAGE_GET_LINE (
3729         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3730     PIXMAN_IMAGE_GET_LINE (
3731         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3732
3733     xmm_src = expand_pixel_32_1x128 (src);
3734     xmm_alpha = expand_alpha_1x128 (xmm_src);
3735     mmx_src = xmm_src;
3736     mmx_alpha = xmm_alpha;
3737
3738     while (height--)
3739     {
3740         dst = dst_line;
3741         dst_line += dst_stride;
3742         mask = mask_line;
3743         mask_line += mask_stride;
3744         w = width;
3745
3746         while (w && (unsigned long)dst & 15)
3747         {
3748             m = *mask++;
3749
3750             if (m)
3751             {
3752                 d = *dst;
3753                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3754                 mmx_dest = expand565_16_1x128 (d);
3755
3756                 *dst = pack_565_32_16 (
3757                     pack_1x128_32 (
3758                         in_over_1x128 (
3759                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3760             }
3761
3762             w--;
3763             dst++;
3764         }
3765
3766         while (w >= 8)
3767         {
3768             xmm_dst = load_128_aligned ((__m128i*) dst);
3769             unpack_565_128_4x128 (xmm_dst,
3770                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3771
3772             m = *((uint32_t*)mask);
3773             mask += 4;
3774
3775             if (m)
3776             {
3777                 xmm_mask = unpack_32_1x128 (m);
3778                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3779
3780                 /* Unpacking */
3781                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3782
3783                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3784                                         &xmm_mask_lo, &xmm_mask_hi);
3785
3786                 in_over_2x128 (&xmm_src, &xmm_src,
3787                                &xmm_alpha, &xmm_alpha,
3788                                &xmm_mask_lo, &xmm_mask_hi,
3789                                &xmm_dst0, &xmm_dst1);
3790             }
3791
3792             m = *((uint32_t*)mask);
3793             mask += 4;
3794
3795             if (m)
3796             {
3797                 xmm_mask = unpack_32_1x128 (m);
3798                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3799
3800                 /* Unpacking */
3801                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3802
3803                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3804                                         &xmm_mask_lo, &xmm_mask_hi);
3805                 in_over_2x128 (&xmm_src, &xmm_src,
3806                                &xmm_alpha, &xmm_alpha,
3807                                &xmm_mask_lo, &xmm_mask_hi,
3808                                &xmm_dst2, &xmm_dst3);
3809             }
3810
3811             save_128_aligned (
3812                 (__m128i*)dst, pack_565_4x128_128 (
3813                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3814
3815             w -= 8;
3816             dst += 8;
3817         }
3818
3819         while (w)
3820         {
3821             m = *mask++;
3822
3823             if (m)
3824             {
3825                 d = *dst;
3826                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3827                 mmx_dest = expand565_16_1x128 (d);
3828
3829                 *dst = pack_565_32_16 (
3830                     pack_1x128_32 (
3831                         in_over_1x128 (
3832                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3833             }
3834
3835             w--;
3836             dst++;
3837         }
3838     }
3839
3840 }
3841
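/* r5g6b5 pixels cannot be blended directly, so the loop above widens
 * them to 8888, blends, and packs the result back.  Scalar sketch of
 * the two conversions (illustrative only):
 *
 *     static uint32_t expand_565 (uint16_t p)
 *     {
 *         uint32_t r = (p >> 11) & 0x1f;
 *         uint32_t g = (p >> 5) & 0x3f;
 *         uint32_t b = p & 0x1f;
 *
 *         r = (r << 3) | (r >> 2);
 *         g = (g << 2) | (g >> 4);
 *         b = (b << 3) | (b >> 2);
 *
 *         return (r << 16) | (g << 8) | b;
 *     }
 *
 *     static uint16_t pack_565 (uint32_t p)
 *     {
 *         return (uint16_t) (((p >> 8) & 0xf800) |
 *                            ((p >> 5) & 0x07e0) |
 *                            ((p >> 3) & 0x001f));
 *     }
 *
 * unpack_565_128_4x128 and pack_565_4x128_128 do the same for eight
 * pixels at a time using the mask_565_* constants.
 */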
3842 /* -----------------------------------------------------------------------
3843  * composite_over_pixbuf_0565
3844  */
3845
3846 static void
3847 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3848                                  pixman_op_t              op,
3849                                  pixman_image_t *         src_image,
3850                                  pixman_image_t *         mask_image,
3851                                  pixman_image_t *         dst_image,
3852                                  int32_t                  src_x,
3853                                  int32_t                  src_y,
3854                                  int32_t                  mask_x,
3855                                  int32_t                  mask_y,
3856                                  int32_t                  dest_x,
3857                                  int32_t                  dest_y,
3858                                  int32_t                  width,
3859                                  int32_t                  height)
3860 {
3861     uint16_t    *dst_line, *dst, d;
3862     uint32_t    *src_line, *src, s;
3863     int dst_stride, src_stride;
3864     int32_t w;
3865     uint32_t opaque, zero;
3866
3867     __m128i ms;
3868     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3869     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3870
3871     PIXMAN_IMAGE_GET_LINE (
3872         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3873     PIXMAN_IMAGE_GET_LINE (
3874         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3875
3876 #if 0
3877     /* FIXME
3878      *
3879      * This code was copied from the MMX version, FIXME included.
3880      * If it's a problem there, it's probably a problem here too.
3881      */
3882     assert (src_image->drawable == mask_image->drawable);
3883 #endif
3884
3885     while (height--)
3886     {
3887         dst = dst_line;
3888         dst_line += dst_stride;
3889         src = src_line;
3890         src_line += src_stride;
3891         w = width;
3892
3893         while (w && (unsigned long)dst & 15)
3894         {
3895             s = *src++;
3896             d = *dst;
3897
3898             ms = unpack_32_1x128 (s);
3899
3900             *dst++ = pack_565_32_16 (
3901                 pack_1x128_32 (
3902                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3903             w--;
3904         }
3905
3906         while (w >= 8)
3907         {
3908             /* First round */
3909             xmm_src = load_128_unaligned ((__m128i*)src);
3910             xmm_dst = load_128_aligned  ((__m128i*)dst);
3911
3912             opaque = is_opaque (xmm_src);
3913             zero = is_zero (xmm_src);
3914
3915             unpack_565_128_4x128 (xmm_dst,
3916                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3917             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3918
3919             /* preload next round */
3920             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3921
3922             if (opaque)
3923             {
3924                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3925                                      &xmm_dst0, &xmm_dst1);
3926             }
3927             else if (!zero)
3928             {
3929                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3930                                         &xmm_dst0, &xmm_dst1);
3931             }
3932
3933             /* Second round */
3934             opaque = is_opaque (xmm_src);
3935             zero = is_zero (xmm_src);
3936
3937             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3938
3939             if (opaque)
3940             {
3941                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3942                                      &xmm_dst2, &xmm_dst3);
3943             }
3944             else if (!zero)
3945             {
3946                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3947                                         &xmm_dst2, &xmm_dst3);
3948             }
3949
3950             save_128_aligned (
3951                 (__m128i*)dst, pack_565_4x128_128 (
3952                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3953
3954             w -= 8;
3955             src += 8;
3956             dst += 8;
3957         }
3958
3959         while (w)
3960         {
3961             s = *src++;
3962             d = *dst;
3963
3964             ms = unpack_32_1x128 (s);
3965
3966             *dst++ = pack_565_32_16 (
3967                 pack_1x128_32 (
3968                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3969             w--;
3970         }
3971     }
3972
3973 }
3974
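/* "pixbuf" sources are non-premultiplied and have R and B swapped
 * relative to the destination, so over_rev_non_pre first premultiplies
 * and channel-swaps each pixel and then does a normal OVER.  Scalar
 * sketch (illustrative only, mul_div_255 as before):
 *
 *     uint32_t a = s >> 24;
 *     uint32_t r = mul_div_255 (s & 0xff, a);
 *     uint32_t g = mul_div_255 ((s >> 8) & 0xff, a);
 *     uint32_t b = mul_div_255 ((s >> 16) & 0xff, a);
 *
 *     s = (a << 24) | (r << 16) | (g << 8) | b;
 *
 * When a vector is fully opaque the premultiply is the identity, so
 * invert_colors_2x128 (the swap alone) produces the result directly;
 * when it is fully transparent the store is skipped entirely.
 */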
3975 /* -------------------------------------------------------------------------
3976  * composite_over_pixbuf_8888
3977  */
3978
3979 static void
3980 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3981                                  pixman_op_t              op,
3982                                  pixman_image_t *         src_image,
3983                                  pixman_image_t *         mask_image,
3984                                  pixman_image_t *         dst_image,
3985                                  int32_t                  src_x,
3986                                  int32_t                  src_y,
3987                                  int32_t                  mask_x,
3988                                  int32_t                  mask_y,
3989                                  int32_t                  dest_x,
3990                                  int32_t                  dest_y,
3991                                  int32_t                  width,
3992                                  int32_t                  height)
3993 {
3994     uint32_t    *dst_line, *dst, d;
3995     uint32_t    *src_line, *src, s;
3996     int dst_stride, src_stride;
3997     int32_t w;
3998     uint32_t opaque, zero;
3999
4000     __m128i xmm_src_lo, xmm_src_hi;
4001     __m128i xmm_dst_lo, xmm_dst_hi;
4002
4003     PIXMAN_IMAGE_GET_LINE (
4004         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4005     PIXMAN_IMAGE_GET_LINE (
4006         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4007
4008 #if 0
4009     /* FIXME
4010      *
4011      * This code was copied from the MMX version, FIXME included.
4012      * If it's a problem there, it's probably a problem here too.
4013      */
4014     assert (src_image->drawable == mask_image->drawable);
4015 #endif
4016
4017     while (height--)
4018     {
4019         dst = dst_line;
4020         dst_line += dst_stride;
4021         src = src_line;
4022         src_line += src_stride;
4023         w = width;
4024
4025         while (w && (unsigned long)dst & 15)
4026         {
4027             s = *src++;
4028             d = *dst;
4029
4030             *dst++ = pack_1x128_32 (
4031                 over_rev_non_pre_1x128 (
4032                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4033
4034             w--;
4035         }
4036
4037         while (w >= 4)
4038         {
4039             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4040
4041             opaque = is_opaque (xmm_src_hi);
4042             zero = is_zero (xmm_src_hi);
4043
4044             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4045
4046             if (opaque)
4047             {
4048                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4049                                      &xmm_dst_lo, &xmm_dst_hi);
4050
4051                 save_128_aligned (
4052                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4053             }
4054             else if (!zero)
4055             {
4056                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4057
4058                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4059
4060                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4061                                         &xmm_dst_lo, &xmm_dst_hi);
4062
4063                 save_128_aligned (
4064                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4065             }
4066
4067             w -= 4;
4068             dst += 4;
4069             src += 4;
4070         }
4071
4072         while (w)
4073         {
4074             s = *src++;
4075             d = *dst;
4076
4077             *dst++ = pack_1x128_32 (
4078                 over_rev_non_pre_1x128 (
4079                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4080
4081             w--;
4082         }
4083     }
4084
4085 }
4086
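/* The is_opaque / is_zero tests above classify four pixels with one
 * compare and one movemask each.  A sketch of how such tests look in
 * SSE2 (pixman's helpers, defined earlier in this file, are
 * equivalent in spirit):
 *
 *     static int all_alpha_ff (__m128i x)
 *     {
 *         __m128i ff = _mm_cmpeq_epi8 (x, x);
 *         int m = _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ff));
 *
 *         return (m & 0x8888) == 0x8888;
 *     }
 *
 *     static int all_zero (__m128i x)
 *     {
 *         return _mm_movemask_epi8 (
 *             _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
 *     }
 *
 * 0x8888 selects the movemask bits of bytes 3, 7, 11 and 15, i.e. the
 * alpha byte of each of the four 8888 pixels.  Opaque vectors take the
 * cheap copy path and all-zero vectors skip the store altogether.
 */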
4087 /* -------------------------------------------------------------------------------------------------
4088  * composite_over_n_8888_0565_ca
4089  */
4090
4091 static void
4092 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4093                                     pixman_op_t              op,
4094                                     pixman_image_t *         src_image,
4095                                     pixman_image_t *         mask_image,
4096                                     pixman_image_t *         dst_image,
4097                                     int32_t                  src_x,
4098                                     int32_t                  src_y,
4099                                     int32_t                  mask_x,
4100                                     int32_t                  mask_y,
4101                                     int32_t                  dest_x,
4102                                     int32_t                  dest_y,
4103                                     int32_t                  width,
4104                                     int32_t                  height)
4105 {
4106     uint32_t src;
4107     uint16_t    *dst_line, *dst, d;
4108     uint32_t    *mask_line, *mask, m;
4109     int dst_stride, mask_stride;
4110     int w;
4111     uint32_t pack_cmp;
4112
4113     __m128i xmm_src, xmm_alpha;
4114     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4115     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4116
4117     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4118
4119     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4120
4121     if (src == 0)
4122         return;
4123
4124     PIXMAN_IMAGE_GET_LINE (
4125         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4126     PIXMAN_IMAGE_GET_LINE (
4127         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4128
4129     xmm_src = expand_pixel_32_1x128 (src);
4130     xmm_alpha = expand_alpha_1x128 (xmm_src);
4131     mmx_src = xmm_src;
4132     mmx_alpha = xmm_alpha;
4133
4134     while (height--)
4135     {
4136         w = width;
4137         mask = mask_line;
4138         dst = dst_line;
4139         mask_line += mask_stride;
4140         dst_line += dst_stride;
4141
4142         while (w && ((unsigned long)dst & 15))
4143         {
4144             m = *(uint32_t *) mask;
4145
4146             if (m)
4147             {
4148                 d = *dst;
4149                 mmx_mask = unpack_32_1x128 (m);
4150                 mmx_dest = expand565_16_1x128 (d);
4151
4152                 *dst = pack_565_32_16 (
4153                     pack_1x128_32 (
4154                         in_over_1x128 (
4155                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4156             }
4157
4158             w--;
4159             dst++;
4160             mask++;
4161         }
4162
4163         while (w >= 8)
4164         {
4165             /* First round */
4166             xmm_mask = load_128_unaligned ((__m128i*)mask);
4167             xmm_dst = load_128_aligned ((__m128i*)dst);
4168
4169             pack_cmp = _mm_movemask_epi8 (
4170                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4171
4172             unpack_565_128_4x128 (xmm_dst,
4173                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4174             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4175
4176             /* preload next round */
4177             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4178
4180             if (pack_cmp != 0xffff)
4181             {
4182                 in_over_2x128 (&xmm_src, &xmm_src,
4183                                &xmm_alpha, &xmm_alpha,
4184                                &xmm_mask_lo, &xmm_mask_hi,
4185                                &xmm_dst0, &xmm_dst1);
4186             }
4187
4188             /* Second round */
4189             pack_cmp = _mm_movemask_epi8 (
4190                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4191
4192             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4193
4194             if (pack_cmp != 0xffff)
4195             {
4196                 in_over_2x128 (&xmm_src, &xmm_src,
4197                                &xmm_alpha, &xmm_alpha,
4198                                &xmm_mask_lo, &xmm_mask_hi,
4199                                &xmm_dst2, &xmm_dst3);
4200             }
4201
4202             save_128_aligned (
4203                 (__m128i*)dst, pack_565_4x128_128 (
4204                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4205
4206             w -= 8;
4207             dst += 8;
4208             mask += 8;
4209         }
4210
4211         while (w)
4212         {
4213             m = *(uint32_t *) mask;
4214
4215             if (m)
4216             {
4217                 d = *dst;
4218                 mmx_mask = unpack_32_1x128 (m);
4219                 mmx_dest = expand565_16_1x128 (d);
4220
4221                 *dst = pack_565_32_16 (
4222                     pack_1x128_32 (
4223                         in_over_1x128 (
4224                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4225             }
4226
4227             w--;
4228             dst++;
4229             mask++;
4230         }
4231     }
4232
4233 }
4234
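/* pack_cmp is a movemask over a 32-bit compare of the mask vector
 * against zero: it equals 0xffff exactly when all four component-alpha
 * mask values are zero, in which case the destination already holds
 * the correct pixels and the half-vector blend is skipped.  The scalar
 * equivalent of the test is simply:
 *
 *     if (m0 | m1 | m2 | m3)
 *         blend ();
 *
 * With component alpha, each of the mask's R, G and B bytes scales the
 * matching source channel separately, so the mask is fed to
 * in_over_2x128 as-is instead of being expanded from a single alpha
 * byte as in the a8-mask paths.
 */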
4235 /* -----------------------------------------------------------------------
4236  * composite_in_n_8_8
4237  */
4238
4239 static void
4240 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4241                          pixman_op_t              op,
4242                          pixman_image_t *         src_image,
4243                          pixman_image_t *         mask_image,
4244                          pixman_image_t *         dst_image,
4245                          int32_t                  src_x,
4246                          int32_t                  src_y,
4247                          int32_t                  mask_x,
4248                          int32_t                  mask_y,
4249                          int32_t                  dest_x,
4250                          int32_t                  dest_y,
4251                          int32_t                  width,
4252                          int32_t                  height)
4253 {
4254     uint8_t     *dst_line, *dst;
4255     uint8_t     *mask_line, *mask;
4256     int dst_stride, mask_stride;
4257     uint32_t d, m;
4258     uint32_t src;
4259     uint8_t sa;
4260     int32_t w;
4261
4262     __m128i xmm_alpha;
4263     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4264     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4265
4266     PIXMAN_IMAGE_GET_LINE (
4267         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4268     PIXMAN_IMAGE_GET_LINE (
4269         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4270
4271     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4272
4273     sa = src >> 24;
4274
4275     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4276
4277     while (height--)
4278     {
4279         dst = dst_line;
4280         dst_line += dst_stride;
4281         mask = mask_line;
4282         mask_line += mask_stride;
4283         w = width;
4284
4285         while (w && ((unsigned long)dst & 15))
4286         {
4287             m = (uint32_t) *mask++;
4288             d = (uint32_t) *dst;
4289
4290             *dst++ = (uint8_t) pack_1x128_32 (
4291                 pix_multiply_1x128 (
4292                     pix_multiply_1x128 (xmm_alpha,
4293                                        unpack_32_1x128 (m)),
4294                     unpack_32_1x128 (d)));
4295             w--;
4296         }
4297
4298         while (w >= 16)
4299         {
4300             xmm_mask = load_128_unaligned ((__m128i*)mask);
4301             xmm_dst = load_128_aligned ((__m128i*)dst);
4302
4303             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4304             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4305
4306             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4307                                 &xmm_mask_lo, &xmm_mask_hi,
4308                                 &xmm_mask_lo, &xmm_mask_hi);
4309
4310             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4311                                 &xmm_dst_lo, &xmm_dst_hi,
4312                                 &xmm_dst_lo, &xmm_dst_hi);
4313
4314             save_128_aligned (
4315                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4316
4317             mask += 16;
4318             dst += 16;
4319             w -= 16;
4320         }
4321
4322         while (w)
4323         {
4324             m = (uint32_t) *mask++;
4325             d = (uint32_t) *dst;
4326
4327             *dst++ = (uint8_t) pack_1x128_32 (
4328                 pix_multiply_1x128 (
4329                     pix_multiply_1x128 (
4330                         xmm_alpha, unpack_32_1x128 (m)),
4331                     unpack_32_1x128 (d)));
4332             w--;
4333         }
4334     }
4335
4336 }
4337
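/* The IN operator above is two rounded byte multiplies per channel:
 *
 *     dst = mul_div_255 (mul_div_255 (srca, m), dst);
 *
 * (scalar sketch, mul_div_255 as before).  The two pix_multiply_2x128
 * calls in the main loop compute exactly this on sixteen a8 pixels
 * per iteration.
 */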
4338 /* -----------------------------------------------------------------------
4339  * composite_in_n_8
4340  */
4341
4342 static void
4343 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4344                        pixman_op_t              op,
4345                        pixman_image_t *         src_image,
4346                        pixman_image_t *         mask_image,
4347                        pixman_image_t *         dst_image,
4348                        int32_t                  src_x,
4349                        int32_t                  src_y,
4350                        int32_t                  mask_x,
4351                        int32_t                  mask_y,
4352                        int32_t                  dest_x,
4353                        int32_t                  dest_y,
4354                        int32_t                  width,
4355                        int32_t                  height)
4356 {
4357     uint8_t     *dst_line, *dst;
4358     int dst_stride;
4359     uint32_t d;
4360     uint32_t src;
4361     int32_t w;
4362
4363     __m128i xmm_alpha;
4364     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4365
4366     PIXMAN_IMAGE_GET_LINE (
4367         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4368
4369     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4370
4371     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4372
4373     src = src >> 24;
4374
4375     if (src == 0xff)
4376         return;
4377
4378     if (src == 0x00)
4379     {
4380         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4381                      8, dest_x, dest_y, width, height, src);
4382
4383         return;
4384     }
4385
4386     while (height--)
4387     {
4388         dst = dst_line;
4389         dst_line += dst_stride;
4390         w = width;
4391
4392         while (w && ((unsigned long)dst & 15))
4393         {
4394             d = (uint32_t) *dst;
4395
4396             *dst++ = (uint8_t) pack_1x128_32 (
4397                 pix_multiply_1x128 (
4398                     xmm_alpha,
4399                     unpack_32_1x128 (d)));
4400             w--;
4401         }
4402
4403         while (w >= 16)
4404         {
4405             xmm_dst = load_128_aligned ((__m128i*)dst);
4406
4407             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4408
4409             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4410                                 &xmm_dst_lo, &xmm_dst_hi,
4411                                 &xmm_dst_lo, &xmm_dst_hi);
4412
4413             save_128_aligned (
4414                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4415
4416             dst += 16;
4417             w -= 16;
4418         }
4419
4420         while (w)
4421         {
4422             d = (uint32_t) *dst;
4423
4424             *dst++ = (uint8_t) pack_1x128_32 (
4425                 pix_multiply_1x128 (
4426                     xmm_alpha,
4427                     unpack_32_1x128 (d)));
4428             w--;
4429         }
4430     }
4431
4432 }
4433
4434 /* ---------------------------------------------------------------------------
4435  * composite_in_8_8
4436  */
4437
4438 static void
4439 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4440                        pixman_op_t              op,
4441                        pixman_image_t *         src_image,
4442                        pixman_image_t *         mask_image,
4443                        pixman_image_t *         dst_image,
4444                        int32_t                  src_x,
4445                        int32_t                  src_y,
4446                        int32_t                  mask_x,
4447                        int32_t                  mask_y,
4448                        int32_t                  dest_x,
4449                        int32_t                  dest_y,
4450                        int32_t                  width,
4451                        int32_t                  height)
4452 {
4453     uint8_t     *dst_line, *dst;
4454     uint8_t     *src_line, *src;
4455     int src_stride, dst_stride;
4456     int32_t w;
4457     uint32_t s, d;
4458
4459     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4460     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4461
4462     PIXMAN_IMAGE_GET_LINE (
4463         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4464     PIXMAN_IMAGE_GET_LINE (
4465         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4466
4467     while (height--)
4468     {
4469         dst = dst_line;
4470         dst_line += dst_stride;
4471         src = src_line;
4472         src_line += src_stride;
4473         w = width;
4474
4475         while (w && ((unsigned long)dst & 15))
4476         {
4477             s = (uint32_t) *src++;
4478             d = (uint32_t) *dst;
4479
4480             *dst++ = (uint8_t) pack_1x128_32 (
4481                 pix_multiply_1x128 (
4482                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4483             w--;
4484         }
4485
4486         while (w >= 16)
4487         {
4488             xmm_src = load_128_unaligned ((__m128i*)src);
4489             xmm_dst = load_128_aligned ((__m128i*)dst);
4490
4491             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4492             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4493
4494             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4495                                 &xmm_dst_lo, &xmm_dst_hi,
4496                                 &xmm_dst_lo, &xmm_dst_hi);
4497
4498             save_128_aligned (
4499                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4500
4501             src += 16;
4502             dst += 16;
4503             w -= 16;
4504         }
4505
4506         while (w)
4507         {
4508             s = (uint32_t) *src++;
4509             d = (uint32_t) *dst;
4510
4511             *dst++ = (uint8_t) pack_1x128_32 (
4512                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4513             w--;
4514         }
4515     }
4516
4517 }
4518
4519 /* -------------------------------------------------------------------------
4520  * composite_add_n_8_8
4521  */
4522
4523 static void
4524 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4525                           pixman_op_t              op,
4526                           pixman_image_t *         src_image,
4527                           pixman_image_t *         mask_image,
4528                           pixman_image_t *         dst_image,
4529                           int32_t                  src_x,
4530                           int32_t                  src_y,
4531                           int32_t                  mask_x,
4532                           int32_t                  mask_y,
4533                           int32_t                  dest_x,
4534                           int32_t                  dest_y,
4535                           int32_t                  width,
4536                           int32_t                  height)
4537 {
4538     uint8_t     *dst_line, *dst;
4539     uint8_t     *mask_line, *mask;
4540     int dst_stride, mask_stride;
4541     int32_t w;
4542     uint32_t src;
4543     uint8_t sa;
4544     uint32_t m, d;
4545
4546     __m128i xmm_alpha;
4547     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4548     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4549
4550     PIXMAN_IMAGE_GET_LINE (
4551         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4552     PIXMAN_IMAGE_GET_LINE (
4553         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4554
4555     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4556
4557     sa = src >> 24;
4558
4559     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4560
4561     while (height--)
4562     {
4563         dst = dst_line;
4564         dst_line += dst_stride;
4565         mask = mask_line;
4566         mask_line += mask_stride;
4567         w = width;
4568
4569         while (w && ((unsigned long)dst & 15))
4570         {
4571             m = (uint32_t) *mask++;
4572             d = (uint32_t) *dst;
4573
4574             *dst++ = (uint8_t) pack_1x128_32 (
4575                 _mm_adds_epu16 (
4576                     pix_multiply_1x128 (
4577                         xmm_alpha, unpack_32_1x128 (m)),
4578                     unpack_32_1x128 (d)));
4579             w--;
4580         }
4581
4582         while (w >= 16)
4583         {
4584             xmm_mask = load_128_unaligned ((__m128i*)mask);
4585             xmm_dst = load_128_aligned ((__m128i*)dst);
4586
4587             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4588             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4589
4590             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4591                                 &xmm_mask_lo, &xmm_mask_hi,
4592                                 &xmm_mask_lo, &xmm_mask_hi);
4593
4594             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4595             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4596
4597             save_128_aligned (
4598                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4599
4600             mask += 16;
4601             dst += 16;
4602             w -= 16;
4603         }
4604
4605         while (w)
4606         {
4607             m = (uint32_t) *mask++;
4608             d = (uint32_t) *dst;
4609
4610             *dst++ = (uint8_t) pack_1x128_32 (
4611                 _mm_adds_epu16 (
4612                     pix_multiply_1x128 (
4613                         xmm_alpha, unpack_32_1x128 (m)),
4614                     unpack_32_1x128 (d)));
4615
4616             w--;
4617         }
4618     }
4619
4620 }
4621
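/* ADD must clamp rather than wrap, which SSE2 provides directly:
 * _mm_adds_epu16 and _mm_adds_epu8 saturate at 0xffff and 0xff.
 * Scalar sketch of the unsigned byte case (illustrative only):
 *
 *     static uint8_t adds_u8 (uint8_t a, uint8_t b)
 *     {
 *         unsigned t = (unsigned) a + b;
 *
 *         return (uint8_t) (t > 0xff ? 0xff : t);
 *     }
 *
 * Here the mask is multiplied into the solid source's alpha first, so
 * the per-pixel work is one multiply and one saturating add.
 */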
4622 /* -------------------------------------------------------------------------
4623  * composite_add_n_8
4624  */
4625
4626 static void
4627 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4628                         pixman_op_t              op,
4629                         pixman_image_t *         src_image,
4630                         pixman_image_t *         mask_image,
4631                         pixman_image_t *         dst_image,
4632                         int32_t                  src_x,
4633                         int32_t                  src_y,
4634                         int32_t                  mask_x,
4635                         int32_t                  mask_y,
4636                         int32_t                  dest_x,
4637                         int32_t                  dest_y,
4638                         int32_t                  width,
4639                         int32_t                  height)
4640 {
4641     uint8_t     *dst_line, *dst;
4642     int dst_stride;
4643     int32_t w;
4644     uint32_t src;
4645
4646     __m128i xmm_src;
4647
4648     PIXMAN_IMAGE_GET_LINE (
4649         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4650
4651     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4652
4653     src >>= 24;
4654
4655     if (src == 0x00)
4656         return;
4657
4658     if (src == 0xff)
4659     {
4660         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4661                      8, dest_x, dest_y, width, height, 0xff);
4662
4663         return;
4664     }
4665
4666     src = (src << 24) | (src << 16) | (src << 8) | src;
4667     xmm_src = _mm_set_epi32 (src, src, src, src);
4668
4669     while (height--)
4670     {
4671         dst = dst_line;
4672         dst_line += dst_stride;
4673         w = width;
4674
4675         while (w && ((unsigned long)dst & 15))
4676         {
4677             *dst = (uint8_t)_mm_cvtsi128_si32 (
4678                 _mm_adds_epu8 (
4679                     xmm_src,
4680                     _mm_cvtsi32_si128 (*dst)));
4681
4682             w--;
4683             dst++;
4684         }
4685
4686         while (w >= 16)
4687         {
4688             save_128_aligned (
4689                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4690
4691             dst += 16;
4692             w -= 16;
4693         }
4694
4695         while (w)
4696         {
4697             *dst = (uint8_t)_mm_cvtsi128_si32 (
4698                 _mm_adds_epu8 (
4699                     xmm_src,
4700                     _mm_cvtsi32_si128 (*dst)));
4701
4702             w--;
4703             dst++;
4704         }
4705     }
4706
4707 }
4708
4709 /* ----------------------------------------------------------------------
4710  * composite_add_8_8
4711  */
4712
4713 static void
4714 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4715                         pixman_op_t              op,
4716                         pixman_image_t *         src_image,
4717                         pixman_image_t *         mask_image,
4718                         pixman_image_t *         dst_image,
4719                         int32_t                  src_x,
4720                         int32_t                  src_y,
4721                         int32_t                  mask_x,
4722                         int32_t                  mask_y,
4723                         int32_t                  dest_x,
4724                         int32_t                  dest_y,
4725                         int32_t                  width,
4726                         int32_t                  height)
4727 {
4728     uint8_t     *dst_line, *dst;
4729     uint8_t     *src_line, *src;
4730     int dst_stride, src_stride;
4731     int32_t w;
4732     uint16_t t;
4733
4734     PIXMAN_IMAGE_GET_LINE (
4735         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4736     PIXMAN_IMAGE_GET_LINE (
4737         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4738
4739     while (height--)
4740     {
4741         dst = dst_line;
4742         src = src_line;
4743
4744         dst_line += dst_stride;
4745         src_line += src_stride;
4746         w = width;
4747
4748         /* Small head */
4749         while (w && (unsigned long)dst & 3)
4750         {
4751             t = (*dst) + (*src++);
4752             *dst++ = t | (0 - (t >> 8));
4753             w--;
4754         }
4755
4756         sse2_combine_add_u (imp, op,
4757                             (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4758
4759         /* Small tail */
4760         dst += w & ~3; /* not 0xfffc: that would truncate w >= 0x10000 */
4761         src += w & ~3;
4762
4763         w &= 3;
4764
4765         while (w)
4766         {
4767             t = (*dst) + (*src++);
4768             *dst++ = t | (0 - (t >> 8));
4769             w--;
4770         }
4771     }
4772
4773 }
4774
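/* The head and tail loops above saturate without branching: for the
 * 9-bit sum t, (t >> 8) is the carry bit, so (0 - (t >> 8)) is either
 * 0x00 or 0xff as a byte, and OR-ing it in clamps the result.  For
 * example 200 + 100 = 0x12c: the carry is 1, 0 - 1 gives all-ones,
 * and 0x2c | 0xff = 0xff.  The aligned middle of each scanline is
 * handed to sse2_combine_add_u, which processes four bytes per 32-bit
 * unit.
 */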
4775 /* ---------------------------------------------------------------------
4776  * composite_add_8888_8888
4777  */
4778 static void
4779 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4780                               pixman_op_t              op,
4781                               pixman_image_t *         src_image,
4782                               pixman_image_t *         mask_image,
4783                               pixman_image_t *         dst_image,
4784                               int32_t                  src_x,
4785                               int32_t                  src_y,
4786                               int32_t                  mask_x,
4787                               int32_t                  mask_y,
4788                               int32_t                  dest_x,
4789                               int32_t                  dest_y,
4790                               int32_t                  width,
4791                               int32_t                  height)
4792 {
4793     uint32_t    *dst_line, *dst;
4794     uint32_t    *src_line, *src;
4795     int dst_stride, src_stride;
4796
4797     PIXMAN_IMAGE_GET_LINE (
4798         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4799     PIXMAN_IMAGE_GET_LINE (
4800         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4801
4802     while (height--)
4803     {
4804         dst = dst_line;
4805         dst_line += dst_stride;
4806         src = src_line;
4807         src_line += src_stride;
4808
4809         sse2_combine_add_u (imp, op, dst, src, NULL, width);
4810     }
4811
4812 }
4813
4814 /* -------------------------------------------------------------------------------------------------
4815  * sse2_composite_copy_area
4816  */
4817
4818 static pixman_bool_t
4819 pixman_blt_sse2 (uint32_t *src_bits,
4820                  uint32_t *dst_bits,
4821                  int       src_stride,
4822                  int       dst_stride,
4823                  int       src_bpp,
4824                  int       dst_bpp,
4825                  int       src_x,
4826                  int       src_y,
4827                  int       dst_x,
4828                  int       dst_y,
4829                  int       width,
4830                  int       height)
4831 {
4832     uint8_t *   src_bytes;
4833     uint8_t *   dst_bytes;
4834     int byte_width;
4835
4836     if (src_bpp != dst_bpp)
4837         return FALSE;
4838
4839     if (src_bpp == 16)
4840     {
4841         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4842         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4843         src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4844         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4845         byte_width = 2 * width;
4846         src_stride *= 2;
4847         dst_stride *= 2;
4848     }
4849     else if (src_bpp == 32)
4850     {
4851         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4852         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4853         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4854         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4855         byte_width = 4 * width;
4856         src_stride *= 4;
4857         dst_stride *= 4;
4858     }
4859     else
4860     {
4861         return FALSE;
4862     }
4863
4864     while (height--)
4865     {
4866         int w;
4867         uint8_t *s = src_bytes;
4868         uint8_t *d = dst_bytes;
4869         src_bytes += src_stride;
4870         dst_bytes += dst_stride;
4871         w = byte_width;
4872
4873         while (w >= 2 && ((unsigned long)d & 3))
4874         {
4875             *(uint16_t *)d = *(uint16_t *)s;
4876             w -= 2;
4877             s += 2;
4878             d += 2;
4879         }
4880
4881         while (w >= 4 && ((unsigned long)d & 15))
4882         {
4883             *(uint32_t *)d = *(uint32_t *)s;
4884
4885             w -= 4;
4886             s += 4;
4887             d += 4;
4888         }
4889
4890         while (w >= 64)
4891         {
4892             __m128i xmm0, xmm1, xmm2, xmm3;
4893
4894             xmm0 = load_128_unaligned ((__m128i*)(s));
4895             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4896             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4897             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4898
4899             save_128_aligned ((__m128i*)(d),    xmm0);
4900             save_128_aligned ((__m128i*)(d + 16), xmm1);
4901             save_128_aligned ((__m128i*)(d + 32), xmm2);
4902             save_128_aligned ((__m128i*)(d + 48), xmm3);
4903
4904             s += 64;
4905             d += 64;
4906             w -= 64;
4907         }
4908
4909         while (w >= 16)
4910         {
4911             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4912
4913             w -= 16;
4914             d += 16;
4915             s += 16;
4916         }
4917
4918         while (w >= 4)
4919         {
4920             *(uint32_t *)d = *(uint32_t *)s;
4921
4922             w -= 4;
4923             s += 4;
4924             d += 4;
4925         }
4926
4927         if (w >= 2)
4928         {
4929             *(uint16_t *)d = *(uint16_t *)s;
4930             w -= 2;
4931             s += 2;
4932             d += 2;
4933         }
4934     }
4935
4937     return TRUE;
4938 }
4939
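/* Usage sketch for pixman_blt_sse2 (hypothetical, non-overlapping
 * 32 bpp buffers; strides in uint32_t units):
 *
 *     uint32_t src[64 * 64], dst[128 * 64];
 *
 *     pixman_blt_sse2 (src, dst, 64, 128, 32, 32,
 *                      0, 0, 16, 8, 32, 32);
 *
 * copies the 32x32 rectangle at (0, 0) of `src' to (16, 8) of `dst'.
 * The source is read with unaligned loads while the head loops align
 * the destination, so only the stores need 16-byte alignment.
 */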
4940 static void
4941 sse2_composite_copy_area (pixman_implementation_t *imp,
4942                           pixman_op_t              op,
4943                           pixman_image_t *         src_image,
4944                           pixman_image_t *         mask_image,
4945                           pixman_image_t *         dst_image,
4946                           int32_t                  src_x,
4947                           int32_t                  src_y,
4948                           int32_t                  mask_x,
4949                           int32_t                  mask_y,
4950                           int32_t                  dest_x,
4951                           int32_t                  dest_y,
4952                           int32_t                  width,
4953                           int32_t                  height)
4954 {
4955     pixman_blt_sse2 (src_image->bits.bits,
4956                      dst_image->bits.bits,
4957                      src_image->bits.rowstride,
4958                      dst_image->bits.rowstride,
4959                      PIXMAN_FORMAT_BPP (src_image->bits.format),
4960                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
4961                      src_x, src_y, dest_x, dest_y, width, height);
4962 }
4963
static void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint8_t         *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;
    __m128i ms;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (unsigned long)dst & 15)
        {
            s = 0xff000000 | *src++;
            m = (uint32_t) *mask++;
            d = *dst;
            ms = unpack_32_1x128 (s);

            if (m != 0xff)
            {
                __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
                __m128i md = unpack_32_1x128 (d);

                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
            }

            *dst++ = pack_1x128_32 (ms);
            w--;
        }

        while (w >= 4)
        {
            m = *(uint32_t*) mask;
            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);

            if (m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_src);
            }
            else
            {
                xmm_dst = load_128_aligned ((__m128i*)dst);

                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            m = (uint32_t) *mask++;

            if (m)
            {
                s = 0xff000000 | *src;

                if (m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ma, md, ms;

                    d = *dst;

                    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
                    md = unpack_32_1x128 (d);
                    ms = unpack_32_1x128 (s);

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
                }
            }

            src++;
            dst++;
            w--;
        }
    }
}

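/* Composite OVER an a8r8g8b8 source through an a8 mask onto an 8888
 * destination.  Unlike the x888 variant above, the source alpha is
 * real, so it takes part in the blend: roughly
 * *dst = s*m + d*(255 - sa*m) per channel, with each product scaled
 * back into 0..255 (sa is the source alpha; a sketch only).  The
 * scalar loops special-case m == 0 (skip) and sa == m == 0xff (plain
 * copy); the 4-pixel loop additionally stores the source directly
 * when the mask dword is 0xffffffff and all four source pixels are
 * opaque.
 */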
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint8_t         *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (unsigned long)dst & 15)
        {
            uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }

        while (w >= 4)
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                xmm_src = load_128_unaligned ((__m128i*)src);

                if (m == 0xffffffff && is_opaque (xmm_src))
                {
                    save_128_aligned ((__m128i *)dst, xmm_src);
                }
                else
                {
                    xmm_dst = load_128_aligned ((__m128i *)dst);

                    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                                   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
                }
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }
    }
}

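/* OVER_REVERSE with a solid source: the destination is composited
 * over the constant color, i.e. *dst = d + src * (255 - da) per
 * channel (da is the destination alpha).  The solid source is
 * expanded into an __m128i once, outside the loops.  Because
 * over_2x128 () writes its result into its third argument, the
 * expanded source is copied into tmp_lo/tmp_hi on every iteration so
 * the original survives for the next group of four pixels.
 */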
static void
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, *dst;
    __m128i xmm_src;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_dsta_hi, xmm_dsta_lo;
    int dst_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            __m128i vd;

            vd = unpack_32_1x128 (*dst);

            *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                              xmm_src));
            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m128i tmp_lo, tmp_hi;

            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);

            tmp_lo = xmm_src;
            tmp_hi = xmm_src;

            over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                        &xmm_dsta_lo, &xmm_dsta_hi,
                        &tmp_lo, &tmp_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            __m128i vd;

            vd = unpack_32_1x128 (*dst);

            *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                              xmm_src));
            w--;
            dst++;
        }
    }
}

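/* Composite OVER an a8r8g8b8 source with an a8r8g8b8 mask in unified
 * (non-component-alpha) mode: only the alpha byte of each mask pixel
 * matters.  That is why the scalar loops use m = (*mask++) >> 24, and
 * why the vector loop expands the loaded mask with expand_alpha_2x128
 * (alpha is already the top byte) rather than expand_alpha_rev_2x128.
 */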
static void
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint32_t    *mask, *mask_line;
    uint32_t    m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (unsigned long)dst & 15)
        {
            uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);

            if (!is_transparent (xmm_mask))
            {
                xmm_src = load_128_unaligned ((__m128i*)src);

                if (is_opaque (xmm_mask) && is_opaque (xmm_src))
                {
                    save_128_aligned ((__m128i *)dst, xmm_src);
                }
                else
                {
                    xmm_dst = load_128_aligned ((__m128i *)dst);

                    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                                   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
                }
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }
    }
}

/* A variant of 'sse2_combine_over_u' with minor tweaks */
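/* The tweak: instead of walking the source linearly, source pixels are
 * gathered through the 16.16 fixed-point coordinate vx, which advances
 * by unit_x per destination pixel (vx >> 16 is the integer source
 * offset).  Four gathered pixels are packed with _mm_set_epi32 and then
 * combined exactly as in the straight OVER combiner.  pm stays NULL
 * here, so combine1/combine4 reduce to plain loads.
 */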
static force_inline void
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                             const uint32_t* ps,
                                             int32_t         w,
                                             pixman_fixed_t  vx,
                                             pixman_fixed_t  unit_x,
                                             pixman_fixed_t  max_vx,
                                             pixman_bool_t   fully_transparent_src)
{
    uint32_t s, d;
    const uint32_t* pm = NULL;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (fully_transparent_src)
        return;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps + (vx >> 16), pm);
        vx += unit_x;

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        if (pm)
            pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i tmp;
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = ps[vx >> 16];
        vx += unit_x;
        tmp2 = ps[vx >> 16];
        vx += unit_x;
        tmp3 = ps[vx >> 16];
        vx += unit_x;
        tmp4 = ps[vx >> 16];
        vx += unit_x;

        tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

        xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);

        if (is_opaque (xmm_src_hi))
        {
            save_128_aligned ((__m128i*)pd, xmm_src_hi);
        }
        else if (!is_zero (xmm_src_hi))
        {
            xmm_dst_hi = load_128_aligned ((__m128i*) pd);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

            expand_alpha_2x128 (
                xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned ((__m128i*)pd,
                              pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        w -= 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps + (vx >> 16), pm);
        vx += unit_x;

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        if (pm)
            pm++;

        w--;
    }
}

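/* FAST_NEAREST_MAINLOOP (from pixman-fast-path.h) wraps the scanline
 * worker above into complete composite functions, one per repeat mode
 * (COVER, NONE, PAD).  The generated sse2_8888_8888_*_OVER entry
 * points are what the SIMPLE_NEAREST_FAST_PATH entries in the fast
 * path table further down refer to.
 */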
FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)

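/* Nearest-scaled OVER with a solid mask: the constant mask alpha
 * (*mask >> 24) is expanded once into xmm_mask, and every gathered
 * source pixel then goes through in_over against that constant.  A
 * zero mask, or a known fully transparent source, makes the whole
 * scanline a no-op.
 */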
static force_inline void
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
                                               uint32_t *       dst,
                                               const uint32_t * src,
                                               int32_t          w,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
                                               pixman_fixed_t   max_vx,
                                               pixman_bool_t    zero_src)
{
    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (zero_src || (*mask >> 24) == 0)
        return;

    xmm_mask = create_mask_16_128 (*mask >> 24);

    while (w && (unsigned long)dst & 15)
    {
        uint32_t s = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        if (s)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (s);
            __m128i alpha     = expand_alpha_1x128 (ms);
            __m128i dest      = xmm_mask;
            __m128i alpha_dst = unpack_32_1x128 (d);

            *dst = pack_1x128_32 (
                in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
        }
        dst++;
        w--;
    }

    while (w >= 4)
    {
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp2 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp3 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp4 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

        if (!is_zero (xmm_src))
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                           &xmm_alpha_lo, &xmm_alpha_hi,
                           &xmm_mask, &xmm_mask,
                           &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        dst += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        if (s)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (s);
            __m128i alpha = expand_alpha_1x128 (ms);
            __m128i mask  = xmm_mask;
            __m128i dest  = unpack_32_1x128 (d);

            *dst = pack_1x128_32 (
                in_over_1x128 (&ms, &alpha, &mask, &dest));
        }

        dst++;
        w--;
    }
}

FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)

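/* The fast path table.  Each PIXMAN_STD_FAST_PATH entry maps an
 * (operator, source format, mask format, destination format) tuple to
 * one of the specialized composite functions in this file; "solid"
 * stands for a repeating 1x1 source and "null" for no mask.  The
 * table is searched top to bottom and the first match wins, so more
 * specific entries must come before more general ones; PIXMAN_OP_NONE
 * terminates it.
 */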
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),

    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),

    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),

    { PIXMAN_OP_NONE },
};

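/* The blt and fill entry points follow the delegation pattern: try
 * the SSE2 routine first and, if it rejects the request (for example
 * an unsupported bpp), hand the call unchanged to the delegate
 * implementation further down the chain.
 */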
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,
          int                      src_stride,
          int                      dst_stride,
          int                      src_bpp,
          int                      dst_bpp,
          int                      src_x,
          int                      src_y,
          int                      dst_x,
          int                      dst_y,
          int                      width,
          int                      height)
{
    if (!pixman_blt_sse2 (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height))
    {
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height);
    }

    return TRUE;
}

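/* On 32-bit x86 the ABI only guarantees 4-byte stack alignment, while
 * spilled __m128i values need 16 bytes; __force_align_arg_pointer__
 * makes GCC realign the stack on entry so the aligned spill slots in
 * this function stay valid regardless of the caller.
 */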
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
           int                      stride,
           int                      bpp,
           int                      x,
           int                      y,
           int                      width,
           int                      height,
           uint32_t xor)
{
    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}

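/* Scanline fetchers.  Each one converts a row of the source image
 * into the canonical a8r8g8b8 intermediate format that the general
 * compositing code consumes, writing into iter->buffer.  For
 * x8r8g8b8 the conversion is just forcing the alpha byte to 0xff.
 */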
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((unsigned long)dst) & 0x0f)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    while (w >= 4)
    {
        save_128_aligned (
            (__m128i *)dst, _mm_or_si128 (
                load_128_unaligned ((__m128i *)src), ff000000));

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    return iter->buffer;
}

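/* r5g6b5 rows are widened eight pixels at a time: unpack_565_to_8888
 * shifts each field into place and replicates its top bits into the
 * newly opened low bits, so full-scale 565 values (0x1f, 0x3f) expand
 * to full-scale 0xff channels, matching the scalar
 * CONVERT_0565_TO_8888 used for the head and tail pixels.
 */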
static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;

    iter->bits += iter->stride;

    while (w && ((unsigned long)dst) & 0x0f)
    {
        uint16_t s = *src++;

        *dst++ = CONVERT_0565_TO_8888 (s);
        w--;
    }

    while (w >= 8)
    {
        __m128i lo, hi, s;

        s = _mm_loadu_si128 ((__m128i *)src);

        lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
        hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));

        save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
        save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        uint16_t s = *src++;

        *dst++ = CONVERT_0565_TO_8888 (s);
        w--;
    }

    return iter->buffer;
}

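/* a8 rows become a8r8g8b8 pixels with zero color channels.
 * Interleaving zeros below each alpha byte twice pushes the alpha
 * into the top byte of its 32-bit lane, which is exactly *src << 24
 * without any explicit shift:
 *
 *     [a0 a1 ...] -> [00 a0 00 a1 ...] -> [00 00 00 a0 ...]
 *
 * (byte order as in memory, little endian, so a0 ends up topmost).
 * Sixteen pixels are expanded per iteration.
 */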
static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

    iter->bits += iter->stride;

    while (w && (((unsigned long)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    while (w >= 16)
    {
        xmm0 = _mm_loadu_si128((__m128i *)src);

        xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
        xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
        xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
        xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
        xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
        xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);

        _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
        _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
        _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
        _mm_store_si128(((__m128i *)(dst + 12)), xmm6);

        dst += 16;
        src += 16;
        w -= 16;
    }

    while (w)
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    return iter->buffer;
}

typedef struct
{
    pixman_format_code_t        format;
    pixman_iter_get_scanline_t  get_scanline;
} fetcher_info_t;

static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
    { PIXMAN_a8,                sse2_fetch_a8 },
    { PIXMAN_null }
};

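/* Install one of the fetchers above when the iteration wants the
 * narrow (32-bit a8r8g8b8) intermediate format, the image is a plain
 * untransformed bits image (FAST_PATH_STANDARD_FLAGS |
 * FAST_PATH_ID_TRANSFORM) and the requested rectangle lies entirely
 * inside it; anything else is delegated.
 */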
static void
sse2_src_iter_init (pixman_implementation_t *imp,
                    pixman_iter_t *iter,
                    pixman_image_t *image,
                    int x, int y, int width, int height,
                    uint8_t *buffer, iter_flags_t flags)
{
#define FLAGS                                                           \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)

    if ((flags & ITER_NARROW)                           &&
        (image->common.flags & FLAGS) == FLAGS          &&
        x >= 0 && y >= 0                                &&
        x + width <= image->bits.width                  &&
        y + height <= image->bits.height)
    {
        const fetcher_info_t *f;

        for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
        {
            if (image->common.extended_format_code == f->format)
            {
                uint8_t *b = (uint8_t *)image->bits.bits;
                int s = image->bits.rowstride * 4;

                iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
                iter->stride = s;
                iter->width = width;
                iter->buffer = (uint32_t *)buffer;

                iter->get_scanline = f->get_scanline;
                return;
            }
        }
    }

    _pixman_implementation_src_iter_init (
        imp->delegate, iter, image, x, y, width, height, buffer, flags);
}

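/* Build the SSE2 implementation: register the fast path table,
 * initialize the __m128i constants used throughout this file, and
 * hook up the combiners, blt, fill and the source iterator.  The
 * combine_32 slots now point straight at the sse2_combine_*
 * functions; the core_combine_* wrappers that used to sit in between
 * were removed.
 */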
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

    /* SSE2 constants */
    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);

    /* Set up function pointers */

    /* SSE code patch for fbcompose.c */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->src_iter_init = sse2_src_iter_init;

    return imp;
}

#endif /* USE_SSE2 */