Add SSE2 fetcher for a8
pixman/pixman-sse2.c
/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
#   include "pixman-x64-mmx-emulation.h"
#endif

#ifdef USE_SSE2

/* --------------------------------------------------------------------
 * Locals
 */

static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_x_alpha;

static __m64 mask_x565_rgb;
static __m64 mask_x565_unpack;

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

/* ----------------------------------------------------------------------
 * SSE2 Inlines
 */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
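
/* Worked example (illustrative): the 5-bit and 6-bit channels are
 * shifted into their 8-bit positions, and the fix-up step replicates
 * each channel's top bits into the freshly opened low bits so that
 * 565 channel maxima map to 0xff rather than 0xf8/0xfc.  For a
 * pure-blue pixel 0x001f: b = 0x1f << 3 = 0xf8, the fix-up adds
 * 0xf8 >> 5 = 0x07, giving 0xff.  The alpha byte of each lane is
 * left as zero here.
 */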

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
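
/* A note on the 0x8888 tests above: _mm_movemask_epi8 packs the top
 * bit of each of the 16 bytes into a 16-bit mask, so bits 3, 7, 11
 * and 15 (0x8888) correspond to byte 3 of each 32-bit pixel, i.e.
 * the alpha byte of an ARGB pixel.  is_opaque therefore checks that
 * all four alpha bytes equal 0xff, and is_transparent that they are
 * all zero, without inspecting the color channels.
 */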

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
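
/* pix_multiply computes an exact x*a/255 per 8-bit component using
 * the classic rounding trick: t = x*a + 0x80, then (t + (t >> 8)) >> 8,
 * with the final step done for all lanes at once as a 16-bit mulhi
 * by 0x0101 (i.e. (t * 257) >> 16).  Worked example: x = a = 0xff
 * gives t = 0xfe01 + 0x80 = 0xfe81, and (0xfe81 * 0x0101) >> 16 =
 * 0xff, as expected for 255*255/255.
 */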

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
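
/* over_2x128 is Porter-Duff OVER for premultiplied pixels:
 * dst = src + dst * (255 - alpha) / 255, evaluated per component
 * with a saturating add.
 */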

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
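
/* in_over_2x128 is the fused (src IN mask) OVER dst operation: both
 * the source and its alpha are first multiplied by the per-component
 * mask, then composited over the destination.
 */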

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

/* ------------------------------------------------------------------
 * MMX inlines
 */

static force_inline __m64
load_32_1x64 (uint32_t data)
{
    return _mm_cvtsi32_si64 (data);
}

static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
}

static force_inline __m64
expand_alpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
expand_pixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (
        unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
pix_multiply_1x64 (__m64 data,
                   __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          mask_x0080),
                           mask_x0101);
}

static force_inline __m64
pix_add_multiply_1x64 (__m64* src,
                       __m64* alpha_dst,
                       __m64* dst,
                       __m64* alpha_src)
{
    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);

    return _mm_adds_pu8 (t1, t2);
}
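
/* pix_add_multiply computes src*alpha_dst/255 + dst*alpha_src/255
 * with a saturating add; it is the shared kernel behind the ATOP,
 * ATOP_REVERSE and XOR combiners below, which differ only in which
 * alphas they negate first.
 */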

static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, mask_x00ff);
}

static force_inline __m64
invert_colors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
}

static force_inline __m64
in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pix_multiply_1x64 (*src, *mask),
                      pix_multiply_1x64 (*alpha, *mask),
                      *dst);
}

static force_inline __m64
over_rev_non_pre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expand_alpha_1x64 (src);

    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
                                         _mm_or_si64 (alpha, mask_x_alpha)),
                      alpha,
                      dst);
}

static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, mask_x565_rgb);
    p = _mm_mullo_pi16 (p, mask_x565_unpack);

    return _mm_srli_pi16 (p, 8);
}
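
/* Worked example of the multiply trick (illustrative), for the blue
 * word: the multiplier 0x0840 is (1 << 11) | (1 << 6), so a 5-bit
 * value b becomes b<<11 | b<<6, whose high byte is b<<3 | b>>2, the
 * usual "replicate top bits" widening.  For b = 0x1f: 0xf800 | 0x07c0
 * = 0xffc0, and after the final >> 8 the word holds 0xff.  The green
 * multiplier 0x0410 does the same for a 6-bit value.
 */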

/* ----------------------------------------------------------------------------
 * Compose Core transformations
 */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m64 ms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        ms = unpack_32_1x64 (src);
        return pack_1x64_32 (
            over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
    }

    return dst;
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m64 ms, mm;

        mm = unpack_32_1x64 (*pm);
        mm = expand_alpha_1x64 (mm);

        ms = unpack_32_1x64 (s);
        ms = pix_multiply_1x64 (ms, mm);

        s = pack_1x64_32 (ms);
    }

    return s;
}
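
/* combine1/combine4 implement the "unified" source fetch: when a
 * mask pointer is present, the source pixel is multiplied by the
 * mask's expanded alpha (src IN mask) before the actual combiner
 * math runs; with a NULL mask the source is passed through untouched.
 */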

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

static force_inline void
core_combine_over_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
        w--;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure about
         * the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);

        if (is_opaque (xmm_src_hi))
        {
            save_128_aligned ((__m128i*)pd, xmm_src_hi);
        }
        else if (!is_zero (xmm_src_hi))
        {
            xmm_dst_hi = load_128_aligned ((__m128i*) pd);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

            expand_alpha_2x128 (
                xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned ((__m128i*)pd,
                              pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        w -= 4;
        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;

        w--;
    }
}
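
/* All of the core_combine_*_sse2 workers below follow the same
 * three-phase shape seen above: a scalar head loop that walks single
 * pixels until the destination pointer reaches a 16-byte boundary, a
 * SIMD body that processes 4 pixels per iteration with aligned
 * destination loads/stores (sources and masks may stay unaligned),
 * and a scalar tail for the remaining 0-3 pixels.
 */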

static force_inline void
core_combine_over_reverse_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (dst),
                               expand_alpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}

static force_inline void
core_combine_in_u_sse2 (uint32_t*       pd,
                        const uint32_t* ps,
                        const uint32_t* pm,
                        int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_reverse_in_u_sse2 (uint32_t*       pd,
                                const uint32_t* ps,
                                const uint32_t *pm,
                                int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_reverse_out_u_sse2 (uint32_t*       pd,
                                 const uint32_t* ps,
                                 const uint32_t* pm,
                                 int             w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static force_inline void
core_combine_out_u_sse2 (uint32_t*       pd,
                         const uint32_t* ps,
                         const uint32_t* pm,
                         int             w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (s), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (s), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
    __m64 da = expand_alpha_1x64 (d);

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
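
/* ATOP: result = src * dst_alpha / 255 + dst * (255 - src_alpha) / 255,
 * i.e. the source is drawn only where the destination already has
 * coverage, and the destination keeps its own alpha.
 */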

static force_inline void
core_combine_atop_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}

static force_inline void
core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
}
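
/* XOR: result = src * (255 - dst_alpha) / 255
 *             + dst * (255 - src_alpha) / 255,
 * so each operand survives only where the other has no coverage.
 */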

static force_inline void
core_combine_xor_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t *mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_add_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t* mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x64 (
            ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}
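
/* SATURATE adds as much of the source as still fits: da here is the
 * headroom 255 - dst_alpha, and when src_alpha exceeds it the whole
 * source pixel is scaled by DIV_UN8 (da, sa) (roughly da * 255 / sa,
 * per pixman-combine32.h) before the add, so the summed alpha never
 * overflows.
 */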

static force_inline void
core_combine_saturate_u_sse2 (uint32_t *      pd,
                              const uint32_t *ps,
                              const uint32_t *pm,
                              int             w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some alpha src is greater than respective ~alpha dst */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_src_ca_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }
}

static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expand_alpha_1x64 (s);
    __m64 unpk_mask = unpack_32_1x64 (mask);
    __m64 unpk_dst  = unpack_32_1x64 (dst);

    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static force_inline void
core_combine_over_ca_sse2 (uint32_t*       pd,
                           const uint32_t* ps,
                           const uint32_t *pm,
                           int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (
        over_1x64 (d, expand_alpha_1x64 (d),
                   pix_multiply_1x64 (unpack_32_1x64 (src),
                                      unpack_32_1x64 (mask))));
}

static force_inline void
core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
                                   const uint32_t* ps,
                                   const uint32_t *pm,
                                   int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline void
core_combine_in_ca_sse2 (uint32_t *      pd,
                         const uint32_t *ps,
                         const uint32_t *pm,
                         int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                expand_alpha_1x64 (unpack_32_1x64 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                expand_alpha_1x64 (unpack_32_1x64 (d))));

        w--;
    }
}

static force_inline void
core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
                                 const uint32_t *ps,
                                 const uint32_t *pm,
                                 int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                pix_multiply_1x64 (unpack_32_1x64 (m),
                                   expand_alpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                pix_multiply_1x64 (unpack_32_1x64 (m),
                                   expand_alpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}

static force_inline void
core_combine_out_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));

        w--;
    }
}
1854
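/* Component-alpha OUT_REVERSE: each destination channel becomes
 *
 *     dst = dst * (1 - mask * alpha (src))
 *
 * i.e. the destination is kept only where the masked source is
 * transparent.
 */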
1855 static force_inline void
1856 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
1857                                   const uint32_t *ps,
1858                                   const uint32_t *pm,
1859                                   int             w)
1860 {
1861     uint32_t s, m, d;
1862
1863     __m128i xmm_alpha_lo, xmm_alpha_hi;
1864     __m128i xmm_src_lo, xmm_src_hi;
1865     __m128i xmm_dst_lo, xmm_dst_hi;
1866     __m128i xmm_mask_lo, xmm_mask_hi;
1867
1868     while (w && (unsigned long)pd & 15)
1869     {
1870         s = *ps++;
1871         m = *pm++;
1872         d = *pd;
1873
1874         *pd++ = pack_1x64_32 (
1875             pix_multiply_1x64 (
1876                 unpack_32_1x64 (d),
1877                 negate_1x64 (pix_multiply_1x64 (
1878                                  unpack_32_1x64 (m),
1879                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
1880         w--;
1881     }
1882
1883     while (w >= 4)
1884     {
1885         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1886         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1887         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1888
1889         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1890         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1891         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1892
1893         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1894                             &xmm_alpha_lo, &xmm_alpha_hi);
1895
1896         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1897                             &xmm_alpha_lo, &xmm_alpha_hi,
1898                             &xmm_mask_lo, &xmm_mask_hi);
1899
1900         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1901                       &xmm_mask_lo, &xmm_mask_hi);
1902
1903         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1904                             &xmm_mask_lo, &xmm_mask_hi,
1905                             &xmm_dst_lo, &xmm_dst_hi);
1906
1907         save_128_aligned (
1908             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1909
1910         ps += 4;
1911         pd += 4;
1912         pm += 4;
1913         w -= 4;
1914     }
1915
1916     while (w)
1917     {
1918         s = *ps++;
1919         m = *pm++;
1920         d = *pd;
1921
1922         *pd++ = pack_1x64_32 (
1923             pix_multiply_1x64 (
1924                 unpack_32_1x64 (d),
1925                 negate_1x64 (pix_multiply_1x64 (
1926                                  unpack_32_1x64 (m),
1927                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
1928         w--;
1929     }
1930 }
1931
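/* Component-alpha ATOP: each destination channel becomes
 *
 *     dst = (src * mask) * alpha (dst) + dst * (1 - mask * alpha (src))
 *
 * The call pix_add_multiply_1x64 (&d, &m, &s, &da) below computes
 * d * m + s * da in a single step.
 */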
1932 static force_inline uint32_t
1933 core_combine_atop_ca_pixel_sse2 (uint32_t src,
1934                                  uint32_t mask,
1935                                  uint32_t dst)
1936 {
1937     __m64 m = unpack_32_1x64 (mask);
1938     __m64 s = unpack_32_1x64 (src);
1939     __m64 d = unpack_32_1x64 (dst);
1940     __m64 sa = expand_alpha_1x64 (s);
1941     __m64 da = expand_alpha_1x64 (d);
1942
1943     s = pix_multiply_1x64 (s, m);
1944     m = negate_1x64 (pix_multiply_1x64 (m, sa));
1945
1946     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
1947 }
1948
1949 static force_inline void
1950 core_combine_atop_ca_sse2 (uint32_t *      pd,
1951                            const uint32_t *ps,
1952                            const uint32_t *pm,
1953                            int             w)
1954 {
1955     uint32_t s, m, d;
1956
1957     __m128i xmm_src_lo, xmm_src_hi;
1958     __m128i xmm_dst_lo, xmm_dst_hi;
1959     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1960     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1961     __m128i xmm_mask_lo, xmm_mask_hi;
1962
1963     while (w && (unsigned long)pd & 15)
1964     {
1965         s = *ps++;
1966         m = *pm++;
1967         d = *pd;
1968
1969         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
1970         w--;
1971     }
1972
1973     while (w >= 4)
1974     {
1975         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1976         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1977         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1978
1979         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1980         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1981         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1982
1983         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1984                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1985         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1986                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1987
1988         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1989                             &xmm_mask_lo, &xmm_mask_hi,
1990                             &xmm_src_lo, &xmm_src_hi);
1991         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1992                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1993                             &xmm_mask_lo, &xmm_mask_hi);
1994
1995         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1996
1997         pix_add_multiply_2x128 (
1998             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
1999             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2000             &xmm_dst_lo, &xmm_dst_hi);
2001
2002         save_128_aligned (
2003             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2004
2005         ps += 4;
2006         pd += 4;
2007         pm += 4;
2008         w -= 4;
2009     }
2010
2011     while (w)
2012     {
2013         s = *ps++;
2014         m = *pm++;
2015         d = *pd;
2016
2017         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2018         w--;
2019     }
2020 }
2021
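/* Component-alpha ATOP_REVERSE: each destination channel becomes
 *
 *     dst = (src * mask) * (1 - alpha (dst)) + dst * (mask * alpha (src))
 *
 * Same structure as ATOP above, but the negation moves from the
 * masked source alpha to the destination alpha.
 */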
2022 static force_inline uint32_t
2023 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2024                                          uint32_t mask,
2025                                          uint32_t dst)
2026 {
2027     __m64 m = unpack_32_1x64 (mask);
2028     __m64 s = unpack_32_1x64 (src);
2029     __m64 d = unpack_32_1x64 (dst);
2030
2031     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2032     __m64 sa = expand_alpha_1x64 (s);
2033
2034     s = pix_multiply_1x64 (s, m);
2035     m = pix_multiply_1x64 (m, sa);
2036
2037     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2038 }
2039
2040 static force_inline void
2041 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2042                                    const uint32_t *ps,
2043                                    const uint32_t *pm,
2044                                    int             w)
2045 {
2046     uint32_t s, m, d;
2047
2048     __m128i xmm_src_lo, xmm_src_hi;
2049     __m128i xmm_dst_lo, xmm_dst_hi;
2050     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2051     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2052     __m128i xmm_mask_lo, xmm_mask_hi;
2053
2054     while (w && (unsigned long)pd & 15)
2055     {
2056         s = *ps++;
2057         m = *pm++;
2058         d = *pd;
2059
2060         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2061         w--;
2062     }
2063
2064     while (w >= 4)
2065     {
2066         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2067         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2068         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2069
2070         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2071         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2072         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2073
2074         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2075                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2076         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2077                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2078
2079         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2080                             &xmm_mask_lo, &xmm_mask_hi,
2081                             &xmm_src_lo, &xmm_src_hi);
2082         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2083                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2084                             &xmm_mask_lo, &xmm_mask_hi);
2085
2086         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2087                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2088
2089         pix_add_multiply_2x128 (
2090             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092             &xmm_dst_lo, &xmm_dst_hi);
2093
2094         save_128_aligned (
2095             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096
2097         ps += 4;
2098         pd += 4;
2099         pm += 4;
2100         w -= 4;
2101     }
2102
2103     while (w)
2104     {
2105         s = *ps++;
2106         m = *pm++;
2107         d = *pd;
2108
2109         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2110         w--;
2111     }
2112 }
2113
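/* Component-alpha XOR: each destination channel becomes
 *
 *     dst = (src * mask) * (1 - alpha (dst)) + dst * (1 - mask * alpha (src))
 *
 * i.e. both alpha factors of ATOP are negated, so source and
 * destination each survive only where the other is transparent.
 */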
2114 static force_inline uint32_t
2115 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2116                                 uint32_t mask,
2117                                 uint32_t dst)
2118 {
2119     __m64 a = unpack_32_1x64 (mask);
2120     __m64 s = unpack_32_1x64 (src);
2121     __m64 d = unpack_32_1x64 (dst);
2122
2123     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2124                                        a, expand_alpha_1x64 (s)));
2125     __m64 dest      = pix_multiply_1x64 (s, a);
2126     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2127
2128     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2129                                                 &alpha_dst,
2130                                                 &dest,
2131                                                 &alpha_src));
2132 }
2133
2134 static force_inline void
2135 core_combine_xor_ca_sse2 (uint32_t *      pd,
2136                           const uint32_t *ps,
2137                           const uint32_t *pm,
2138                           int             w)
2139 {
2140     uint32_t s, m, d;
2141
2142     __m128i xmm_src_lo, xmm_src_hi;
2143     __m128i xmm_dst_lo, xmm_dst_hi;
2144     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2145     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2146     __m128i xmm_mask_lo, xmm_mask_hi;
2147
2148     while (w && (unsigned long)pd & 15)
2149     {
2150         s = *ps++;
2151         m = *pm++;
2152         d = *pd;
2153
2154         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2155         w--;
2156     }
2157
2158     while (w >= 4)
2159     {
2160         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163
2164         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167
2168         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2170         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2171                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172
2173         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2174                             &xmm_mask_lo, &xmm_mask_hi,
2175                             &xmm_src_lo, &xmm_src_hi);
2176         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2177                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2178                             &xmm_mask_lo, &xmm_mask_hi);
2179
2180         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2181                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2183                       &xmm_mask_lo, &xmm_mask_hi);
2184
2185         pix_add_multiply_2x128 (
2186             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2187             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2188             &xmm_dst_lo, &xmm_dst_hi);
2189
2190         save_128_aligned (
2191             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2192
2193         ps += 4;
2194         pd += 4;
2195         pm += 4;
2196         w -= 4;
2197     }
2198
2199     while (w)
2200     {
2201         s = *ps++;
2202         m = *pm++;
2203         d = *pd;
2204
2205         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2206         w--;
2207     }
2208 }
2209
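/* Component-alpha ADD: dst = clamp (src * mask + dst).  The adds use
 * the saturating byte instructions (_mm_adds_pu8 / _mm_adds_epu8) so
 * each channel clamps at 0xff instead of wrapping around.
 */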
2210 static force_inline void
2211 core_combine_add_ca_sse2 (uint32_t *      pd,
2212                           const uint32_t *ps,
2213                           const uint32_t *pm,
2214                           int             w)
2215 {
2216     uint32_t s, m, d;
2217
2218     __m128i xmm_src_lo, xmm_src_hi;
2219     __m128i xmm_dst_lo, xmm_dst_hi;
2220     __m128i xmm_mask_lo, xmm_mask_hi;
2221
2222     while (w && (unsigned long)pd & 15)
2223     {
2224         s = *ps++;
2225         m = *pm++;
2226         d = *pd;
2227
2228         *pd++ = pack_1x64_32 (
2229             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2230                                              unpack_32_1x64 (m)),
2231                           unpack_32_1x64 (d)));
2232         w--;
2233     }
2234
2235     while (w >= 4)
2236     {
2237         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2238         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2239         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2240
2241         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2242         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2243         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2244
2245         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2246                             &xmm_mask_lo, &xmm_mask_hi,
2247                             &xmm_src_lo, &xmm_src_hi);
2248
2249         save_128_aligned (
2250             (__m128i*)pd, pack_2x128_128 (
2251                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2252                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2253
2254         ps += 4;
2255         pd += 4;
2256         pm += 4;
2257         w -= 4;
2258     }
2259
2260     while (w)
2261     {
2262         s = *ps++;
2263         m = *pm++;
2264         d = *pd;
2265
2266         *pd++ = pack_1x64_32 (
2267             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2268                                              unpack_32_1x64 (m)),
2269                           unpack_32_1x64 (d)));
2270         w--;
2271     }
2272 }
2273
2274 /* ---------------------------------------------------
2275  * fb_compose_setup_SSE2
2276  */
2277 static force_inline __m64
2278 create_mask_16_64 (uint16_t mask)
2279 {
2280     return _mm_set1_pi16 (mask);
2281 }
2282
2283 static force_inline __m128i
2284 create_mask_16_128 (uint16_t mask)
2285 {
2286     return _mm_set1_epi16 (mask);
2287 }
2288
2289 static force_inline __m64
2290 create_mask_2x32_64 (uint32_t mask0,
2291                      uint32_t mask1)
2292 {
2293     return _mm_set_pi32 (mask0, mask1);
2294 }
2295
2296 /* Work around a code generation bug in Sun Studio 12. */
2297 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2298 # define create_mask_2x32_128(mask0, mask1)                             \
2299     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2300 #else
2301 static force_inline __m128i
2302 create_mask_2x32_128 (uint32_t mask0,
2303                       uint32_t mask1)
2304 {
2305     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2306 }
2307 #endif
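/* A sketch of how these helpers are intended to be used: the global
 * masks declared at the top of this file are seeded by the
 * implementation setup code later on, roughly along these lines
 * (the exact calls there may differ; these are illustrative only):
 *
 *     mask_0080     = create_mask_16_128 (0x0080);
 *     mask_00ff     = create_mask_16_128 (0x00ff);
 *     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
 */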
2308
2309 /* SSE2 code patch for fbcompose.c */
2310
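/* Each wrapper below simply forwards to the corresponding
 * core_combine_*_sse2 worker, then issues _mm_empty () to clear the
 * MMX state left behind by the __m64 scalar paths, so that any
 * following x87 floating-point code sees a clean register stack.
 */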
2311 static void
2312 sse2_combine_over_u (pixman_implementation_t *imp,
2313                      pixman_op_t              op,
2314                      uint32_t *               dst,
2315                      const uint32_t *         src,
2316                      const uint32_t *         mask,
2317                      int                      width)
2318 {
2319     core_combine_over_u_sse2 (dst, src, mask, width);
2320     _mm_empty ();
2321 }
2322
2323 static void
2324 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2325                              pixman_op_t              op,
2326                              uint32_t *               dst,
2327                              const uint32_t *         src,
2328                              const uint32_t *         mask,
2329                              int                      width)
2330 {
2331     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2332     _mm_empty ();
2333 }
2334
2335 static void
2336 sse2_combine_in_u (pixman_implementation_t *imp,
2337                    pixman_op_t              op,
2338                    uint32_t *               dst,
2339                    const uint32_t *         src,
2340                    const uint32_t *         mask,
2341                    int                      width)
2342 {
2343     core_combine_in_u_sse2 (dst, src, mask, width);
2344     _mm_empty ();
2345 }
2346
2347 static void
2348 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2349                            pixman_op_t              op,
2350                            uint32_t *               dst,
2351                            const uint32_t *         src,
2352                            const uint32_t *         mask,
2353                            int                      width)
2354 {
2355     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2356     _mm_empty ();
2357 }
2358
2359 static void
2360 sse2_combine_out_u (pixman_implementation_t *imp,
2361                     pixman_op_t              op,
2362                     uint32_t *               dst,
2363                     const uint32_t *         src,
2364                     const uint32_t *         mask,
2365                     int                      width)
2366 {
2367     core_combine_out_u_sse2 (dst, src, mask, width);
2368     _mm_empty ();
2369 }
2370
2371 static void
2372 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2373                             pixman_op_t              op,
2374                             uint32_t *               dst,
2375                             const uint32_t *         src,
2376                             const uint32_t *         mask,
2377                             int                      width)
2378 {
2379     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2380     _mm_empty ();
2381 }
2382
2383 static void
2384 sse2_combine_atop_u (pixman_implementation_t *imp,
2385                      pixman_op_t              op,
2386                      uint32_t *               dst,
2387                      const uint32_t *         src,
2388                      const uint32_t *         mask,
2389                      int                      width)
2390 {
2391     core_combine_atop_u_sse2 (dst, src, mask, width);
2392     _mm_empty ();
2393 }
2394
2395 static void
2396 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2397                              pixman_op_t              op,
2398                              uint32_t *               dst,
2399                              const uint32_t *         src,
2400                              const uint32_t *         mask,
2401                              int                      width)
2402 {
2403     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2404     _mm_empty ();
2405 }
2406
2407 static void
2408 sse2_combine_xor_u (pixman_implementation_t *imp,
2409                     pixman_op_t              op,
2410                     uint32_t *               dst,
2411                     const uint32_t *         src,
2412                     const uint32_t *         mask,
2413                     int                      width)
2414 {
2415     core_combine_xor_u_sse2 (dst, src, mask, width);
2416     _mm_empty ();
2417 }
2418
2419 static void
2420 sse2_combine_add_u (pixman_implementation_t *imp,
2421                     pixman_op_t              op,
2422                     uint32_t *               dst,
2423                     const uint32_t *         src,
2424                     const uint32_t *         mask,
2425                     int                      width)
2426 {
2427     core_combine_add_u_sse2 (dst, src, mask, width);
2428     _mm_empty ();
2429 }
2430
2431 static void
2432 sse2_combine_saturate_u (pixman_implementation_t *imp,
2433                          pixman_op_t              op,
2434                          uint32_t *               dst,
2435                          const uint32_t *         src,
2436                          const uint32_t *         mask,
2437                          int                      width)
2438 {
2439     core_combine_saturate_u_sse2 (dst, src, mask, width);
2440     _mm_empty ();
2441 }
2442
2443 static void
2444 sse2_combine_src_ca (pixman_implementation_t *imp,
2445                      pixman_op_t              op,
2446                      uint32_t *               dst,
2447                      const uint32_t *         src,
2448                      const uint32_t *         mask,
2449                      int                      width)
2450 {
2451     core_combine_src_ca_sse2 (dst, src, mask, width);
2452     _mm_empty ();
2453 }
2454
2455 static void
2456 sse2_combine_over_ca (pixman_implementation_t *imp,
2457                       pixman_op_t              op,
2458                       uint32_t *               dst,
2459                       const uint32_t *         src,
2460                       const uint32_t *         mask,
2461                       int                      width)
2462 {
2463     core_combine_over_ca_sse2 (dst, src, mask, width);
2464     _mm_empty ();
2465 }
2466
2467 static void
2468 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2469                               pixman_op_t              op,
2470                               uint32_t *               dst,
2471                               const uint32_t *         src,
2472                               const uint32_t *         mask,
2473                               int                      width)
2474 {
2475     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2476     _mm_empty ();
2477 }
2478
2479 static void
2480 sse2_combine_in_ca (pixman_implementation_t *imp,
2481                     pixman_op_t              op,
2482                     uint32_t *               dst,
2483                     const uint32_t *         src,
2484                     const uint32_t *         mask,
2485                     int                      width)
2486 {
2487     core_combine_in_ca_sse2 (dst, src, mask, width);
2488     _mm_empty ();
2489 }
2490
2491 static void
2492 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2493                             pixman_op_t              op,
2494                             uint32_t *               dst,
2495                             const uint32_t *         src,
2496                             const uint32_t *         mask,
2497                             int                      width)
2498 {
2499     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2500     _mm_empty ();
2501 }
2502
2503 static void
2504 sse2_combine_out_ca (pixman_implementation_t *imp,
2505                      pixman_op_t              op,
2506                      uint32_t *               dst,
2507                      const uint32_t *         src,
2508                      const uint32_t *         mask,
2509                      int                      width)
2510 {
2511     core_combine_out_ca_sse2 (dst, src, mask, width);
2512     _mm_empty ();
2513 }
2514
2515 static void
2516 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2517                              pixman_op_t              op,
2518                              uint32_t *               dst,
2519                              const uint32_t *         src,
2520                              const uint32_t *         mask,
2521                              int                      width)
2522 {
2523     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2524     _mm_empty ();
2525 }
2526
2527 static void
2528 sse2_combine_atop_ca (pixman_implementation_t *imp,
2529                       pixman_op_t              op,
2530                       uint32_t *               dst,
2531                       const uint32_t *         src,
2532                       const uint32_t *         mask,
2533                       int                      width)
2534 {
2535     core_combine_atop_ca_sse2 (dst, src, mask, width);
2536     _mm_empty ();
2537 }
2538
2539 static void
2540 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2541                               pixman_op_t              op,
2542                               uint32_t *               dst,
2543                               const uint32_t *         src,
2544                               const uint32_t *         mask,
2545                               int                      width)
2546 {
2547     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2548     _mm_empty ();
2549 }
2550
2551 static void
2552 sse2_combine_xor_ca (pixman_implementation_t *imp,
2553                      pixman_op_t              op,
2554                      uint32_t *               dst,
2555                      const uint32_t *         src,
2556                      const uint32_t *         mask,
2557                      int                      width)
2558 {
2559     core_combine_xor_ca_sse2 (dst, src, mask, width);
2560     _mm_empty ();
2561 }
2562
2563 static void
2564 sse2_combine_add_ca (pixman_implementation_t *imp,
2565                      pixman_op_t              op,
2566                      uint32_t *               dst,
2567                      const uint32_t *         src,
2568                      const uint32_t *         mask,
2569                      int                      width)
2570 {
2571     core_combine_add_ca_sse2 (dst, src, mask, width);
2572     _mm_empty ();
2573 }
2574
2575 /* -------------------------------------------------------------------
2576  * composite_over_n_8888
2577  */
2578
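/* Solid source OVER a 32bpp destination.  Like the other fast paths
 * below, each scanline is processed in three phases: scalar pixels
 * until dst reaches a 16-byte boundary, an aligned 4-pixel SSE2
 * loop, and a scalar tail for the remainder.
 */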
2579 static void
2580 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2581                             pixman_op_t              op,
2582                             pixman_image_t *         src_image,
2583                             pixman_image_t *         mask_image,
2584                             pixman_image_t *         dst_image,
2585                             int32_t                  src_x,
2586                             int32_t                  src_y,
2587                             int32_t                  mask_x,
2588                             int32_t                  mask_y,
2589                             int32_t                  dest_x,
2590                             int32_t                  dest_y,
2591                             int32_t                  width,
2592                             int32_t                  height)
2593 {
2594     uint32_t src;
2595     uint32_t    *dst_line, *dst, d;
2596     int32_t w;
2597     int dst_stride;
2598     __m128i xmm_src, xmm_alpha;
2599     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2600
2601     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2602
2603     if (src == 0)
2604         return;
2605
2606     PIXMAN_IMAGE_GET_LINE (
2607         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2608
2609     xmm_src = expand_pixel_32_1x128 (src);
2610     xmm_alpha = expand_alpha_1x128 (xmm_src);
2611
2612     while (height--)
2613     {
2614         dst = dst_line;
2615
2616         dst_line += dst_stride;
2617         w = width;
2618
2619         while (w && (unsigned long)dst & 15)
2620         {
2621             d = *dst;
2622             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2623                                               _mm_movepi64_pi64 (xmm_alpha),
2624                                               unpack_32_1x64 (d)));
2625             w--;
2626         }
2627
2628         while (w >= 4)
2629         {
2630             xmm_dst = load_128_aligned ((__m128i*)dst);
2631
2632             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2633
2634             over_2x128 (&xmm_src, &xmm_src,
2635                         &xmm_alpha, &xmm_alpha,
2636                         &xmm_dst_lo, &xmm_dst_hi);
2637
2638             /* rebuild the 4 pixel data and save */
2639             save_128_aligned (
2640                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2641
2642             w -= 4;
2643             dst += 4;
2644         }
2645
2646         while (w)
2647         {
2648             d = *dst;
2649             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2650                                               _mm_movepi64_pi64 (xmm_alpha),
2651                                               unpack_32_1x64 (d)));
2652             w--;
2653         }
2654
2655     }
2656     _mm_empty ();
2657 }
2658
2659 /* ---------------------------------------------------------------------
2660  * composite_over_n_0565
2661  */
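/* Solid source OVER an r5g6b5 destination.  Pixels are 16 bits wide
 * here, so the aligned SSE2 loop handles 8 pixels per iteration,
 * expanding 565 to 8888 for the blend and packing back afterwards.
 */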
2662 static void
2663 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2664                             pixman_op_t              op,
2665                             pixman_image_t *         src_image,
2666                             pixman_image_t *         mask_image,
2667                             pixman_image_t *         dst_image,
2668                             int32_t                  src_x,
2669                             int32_t                  src_y,
2670                             int32_t                  mask_x,
2671                             int32_t                  mask_y,
2672                             int32_t                  dest_x,
2673                             int32_t                  dest_y,
2674                             int32_t                  width,
2675                             int32_t                  height)
2676 {
2677     uint32_t src;
2678     uint16_t    *dst_line, *dst, d;
2679     int32_t w;
2680     int dst_stride;
2681     __m128i xmm_src, xmm_alpha;
2682     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2683
2684     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2685
2686     if (src == 0)
2687         return;
2688
2689     PIXMAN_IMAGE_GET_LINE (
2690         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2691
2692     xmm_src = expand_pixel_32_1x128 (src);
2693     xmm_alpha = expand_alpha_1x128 (xmm_src);
2694
2695     while (height--)
2696     {
2697         dst = dst_line;
2698
2699         dst_line += dst_stride;
2700         w = width;
2701
2702         while (w && (unsigned long)dst & 15)
2703         {
2704             d = *dst;
2705
2706             *dst++ = pack_565_32_16 (
2707                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2708                                          _mm_movepi64_pi64 (xmm_alpha),
2709                                          expand565_16_1x64 (d))));
2710             w--;
2711         }
2712
2713         while (w >= 8)
2714         {
2715             xmm_dst = load_128_aligned ((__m128i*)dst);
2716
2717             unpack_565_128_4x128 (xmm_dst,
2718                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2719
2720             over_2x128 (&xmm_src, &xmm_src,
2721                         &xmm_alpha, &xmm_alpha,
2722                         &xmm_dst0, &xmm_dst1);
2723             over_2x128 (&xmm_src, &xmm_src,
2724                         &xmm_alpha, &xmm_alpha,
2725                         &xmm_dst2, &xmm_dst3);
2726
2727             xmm_dst = pack_565_4x128_128 (
2728                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2729
2730             save_128_aligned ((__m128i*)dst, xmm_dst);
2731
2732             dst += 8;
2733             w -= 8;
2734         }
2735
2736         while (w--)
2737         {
2738             d = *dst;
2739             *dst++ = pack_565_32_16 (
2740                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2741                                          _mm_movepi64_pi64 (xmm_alpha),
2742                                          expand565_16_1x64 (d))));
2743         }
2744     }
2745
2746     _mm_empty ();
2747 }
2748
2749 /* ------------------------------
2750  * composite_add_n_8888_8888_ca
2751  */
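/* Solid source added to the destination under a component-alpha
 * a8r8g8b8 mask: dst = clamp (src * mask + dst).  The 4-pixel loop
 * uses a movemask compare so that blocks whose mask is entirely
 * zero are skipped without touching the destination.
 */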
2752 static void
2753 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2754                                    pixman_op_t              op,
2755                                    pixman_image_t *         src_image,
2756                                    pixman_image_t *         mask_image,
2757                                    pixman_image_t *         dst_image,
2758                                    int32_t                  src_x,
2759                                    int32_t                  src_y,
2760                                    int32_t                  mask_x,
2761                                    int32_t                  mask_y,
2762                                    int32_t                  dest_x,
2763                                    int32_t                  dest_y,
2764                                    int32_t                  width,
2765                                    int32_t                  height)
2766 {
2767     uint32_t src;
2768     uint32_t    *dst_line, d;
2769     uint32_t    *mask_line, m;
2770     uint32_t pack_cmp;
2771     int dst_stride, mask_stride;
2772
2773     __m128i xmm_src, xmm_alpha;
2774     __m128i xmm_dst;
2775     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2776
2777     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2778
2779     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2781
2782     if (src == 0)
2783         return;
2784
2785     PIXMAN_IMAGE_GET_LINE (
2786         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2787     PIXMAN_IMAGE_GET_LINE (
2788         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2789
2790     xmm_src = _mm_unpacklo_epi8 (
2791         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2792     xmm_alpha = expand_alpha_1x128 (xmm_src);
2793     mmx_src   = _mm_movepi64_pi64 (xmm_src);
2794     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2795
2796     while (height--)
2797     {
2798         int w = width;
2799         const uint32_t *pm = (uint32_t *)mask_line;
2800         uint32_t *pd = (uint32_t *)dst_line;
2801
2802         dst_line += dst_stride;
2803         mask_line += mask_stride;
2804
2805         while (w && (unsigned long)pd & 15)
2806         {
2807             m = *pm++;
2808
2809             if (m)
2810             {
2811                 d = *pd;
2812
2813                 mmx_mask = unpack_32_1x64 (m);
2814                 mmx_dest = unpack_32_1x64 (d);
2815
2816                 *pd = pack_1x64_32 (
2817                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2818             }
2819
2820             pd++;
2821             w--;
2822         }
2823
2824         while (w >= 4)
2825         {
2826             xmm_mask = load_128_unaligned ((__m128i*)pm);
2827
2828             pack_cmp =
2829                 _mm_movemask_epi8 (
2830                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2831
2832             /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
2833             if (pack_cmp != 0xffff)
2834             {
2835                 xmm_dst = load_128_aligned ((__m128i*)pd);
2836
2837                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2838
2839                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2840                                     &xmm_mask_lo, &xmm_mask_hi,
2841                                     &xmm_mask_lo, &xmm_mask_hi);
2842                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2843
2844                 save_128_aligned (
2845                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2846             }
2847
2848             pd += 4;
2849             pm += 4;
2850             w -= 4;
2851         }
2852
2853         while (w)
2854         {
2855             m = *pm++;
2856
2857             if (m)
2858             {
2859                 d = *pd;
2860
2861                 mmx_mask = unpack_32_1x64 (m);
2862                 mmx_dest = unpack_32_1x64 (d);
2863
2864                 *pd = pack_1x64_32 (
2865                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2866             }
2867
2868             pd++;
2869             w--;
2870         }
2871     }
2872
2873     _mm_empty ();
2874 }
2875
2876 /* ---------------------------------------------------------------------------
2877  * composite_over_n_8888_8888_ca
2878  */
2879
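/* Solid source OVER an a8r8g8b8 destination under a component-alpha
 * a8r8g8b8 mask: dst = (src IN mask) OVER dst, computed per channel
 * by in_over_1x64 / in_over_2x128.  As in the ADD variant above,
 * fully transparent mask blocks are skipped via a movemask compare.
 */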
2880 static void
2881 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2882                                     pixman_op_t              op,
2883                                     pixman_image_t *         src_image,
2884                                     pixman_image_t *         mask_image,
2885                                     pixman_image_t *         dst_image,
2886                                     int32_t                  src_x,
2887                                     int32_t                  src_y,
2888                                     int32_t                  mask_x,
2889                                     int32_t                  mask_y,
2890                                     int32_t                  dest_x,
2891                                     int32_t                  dest_y,
2892                                     int32_t                  width,
2893                                     int32_t                  height)
2894 {
2895     uint32_t src;
2896     uint32_t    *dst_line, d;
2897     uint32_t    *mask_line, m;
2898     uint32_t pack_cmp;
2899     int dst_stride, mask_stride;
2900
2901     __m128i xmm_src, xmm_alpha;
2902     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2903     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2904
2905     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2906
2907     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2908
2909     if (src == 0)
2910         return;
2911
2912     PIXMAN_IMAGE_GET_LINE (
2913         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2914     PIXMAN_IMAGE_GET_LINE (
2915         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2916
2917     xmm_src = _mm_unpacklo_epi8 (
2918         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2919     xmm_alpha = expand_alpha_1x128 (xmm_src);
2920     mmx_src   = _mm_movepi64_pi64 (xmm_src);
2921     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2922
2923     while (height--)
2924     {
2925         int w = width;
2926         const uint32_t *pm = (uint32_t *)mask_line;
2927         uint32_t *pd = (uint32_t *)dst_line;
2928
2929         dst_line += dst_stride;
2930         mask_line += mask_stride;
2931
2932         while (w && (unsigned long)pd & 15)
2933         {
2934             m = *pm++;
2935
2936             if (m)
2937             {
2938                 d = *pd;
2939                 mmx_mask = unpack_32_1x64 (m);
2940                 mmx_dest = unpack_32_1x64 (d);
2941
2942                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
2943                                                   &mmx_alpha,
2944                                                   &mmx_mask,
2945                                                   &mmx_dest));
2946             }
2947
2948             pd++;
2949             w--;
2950         }
2951
2952         while (w >= 4)
2953         {
2954             xmm_mask = load_128_unaligned ((__m128i*)pm);
2955
2956             pack_cmp =
2957                 _mm_movemask_epi8 (
2958                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2959
2960             /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
2961             if (pack_cmp != 0xffff)
2962             {
2963                 xmm_dst = load_128_aligned ((__m128i*)pd);
2964
2965                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2966                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2967
2968                 in_over_2x128 (&xmm_src, &xmm_src,
2969                                &xmm_alpha, &xmm_alpha,
2970                                &xmm_mask_lo, &xmm_mask_hi,
2971                                &xmm_dst_lo, &xmm_dst_hi);
2972
2973                 save_128_aligned (
2974                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2975             }
2976
2977             pd += 4;
2978             pm += 4;
2979             w -= 4;
2980         }
2981
2982         while (w)
2983         {
2984             m = *pm++;
2985
2986             if (m)
2987             {
2988                 d = *pd;
2989                 mmx_mask = unpack_32_1x64 (m);
2990                 mmx_dest = unpack_32_1x64 (d);
2991
2992                 *pd = pack_1x64_32 (
2993                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2994             }
2995
2996             pd++;
2997             w--;
2998         }
2999     }
3000
3001     _mm_empty ();
3002 }
3003
3004 /*---------------------------------------------------------------------
3005  * composite_over_8888_n_8888
3006  */
3007
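/* a8r8g8b8 source OVER an a8r8g8b8 destination under a solid mask.
 * Only the alpha byte of the solid mask matters (mask >> 24), so it
 * is replicated into every 16-bit lane once with create_mask_16_128
 * and reused for the whole image; all-zero source blocks are skipped.
 */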
3008 static void
3009 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3010                                  pixman_op_t              op,
3011                                  pixman_image_t *         src_image,
3012                                  pixman_image_t *         mask_image,
3013                                  pixman_image_t *         dst_image,
3014                                  int32_t                  src_x,
3015                                  int32_t                  src_y,
3016                                  int32_t                  mask_x,
3017                                  int32_t                  mask_y,
3018                                  int32_t                  dest_x,
3019                                  int32_t                  dest_y,
3020                                  int32_t                  width,
3021                                  int32_t                  height)
3022 {
3023     uint32_t    *dst_line, *dst;
3024     uint32_t    *src_line, *src;
3025     uint32_t mask;
3026     int32_t w;
3027     int dst_stride, src_stride;
3028
3029     __m128i xmm_mask;
3030     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3031     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3032     __m128i xmm_alpha_lo, xmm_alpha_hi;
3033
3034     PIXMAN_IMAGE_GET_LINE (
3035         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3036     PIXMAN_IMAGE_GET_LINE (
3037         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3038
3039     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3040
3041     xmm_mask = create_mask_16_128 (mask >> 24);
3042
3043     while (height--)
3044     {
3045         dst = dst_line;
3046         dst_line += dst_stride;
3047         src = src_line;
3048         src_line += src_stride;
3049         w = width;
3050
3051         while (w && (unsigned long)dst & 15)
3052         {
3053             uint32_t s = *src++;
3054
3055             if (s)
3056             {
3057                 uint32_t d = *dst;
3058                 
3059                 __m64 ms = unpack_32_1x64 (s);
3060                 __m64 alpha = expand_alpha_1x64 (ms);
3061                 __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3062                 __m64 dest  = unpack_32_1x64 (d);
3063                 
3064                 *dst = pack_1x64_32 (
3065                     in_over_1x64 (&ms, &alpha, &mask, &dest));
3066             }
3067             dst++;
3068             w--;
3069         }
3070
3071         while (w >= 4)
3072         {
3073             xmm_src = load_128_unaligned ((__m128i*)src);
3074
3075             if (!is_zero (xmm_src))
3076             {
3077                 xmm_dst = load_128_aligned ((__m128i*)dst);
3078                 
3079                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3080                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3081                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3082                                     &xmm_alpha_lo, &xmm_alpha_hi);
3083                 
3084                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3085                                &xmm_alpha_lo, &xmm_alpha_hi,
3086                                &xmm_mask, &xmm_mask,
3087                                &xmm_dst_lo, &xmm_dst_hi);
3088                 
3089                 save_128_aligned (
3090                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3091             }
3092                 
3093             dst += 4;
3094             src += 4;
3095             w -= 4;
3096         }
3097
3098         while (w)
3099         {
3100             uint32_t s = *src++;
3101
3102             if (s)
3103             {
3104                 uint32_t d = *dst;
3105                 
3106                 __m64 ms = unpack_32_1x64 (s);
3107                 __m64 alpha = expand_alpha_1x64 (ms);
3108                 __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3109                 __m64 dest  = unpack_32_1x64 (d);
3110                 
3111                 *dst = pack_1x64_32 (
3112                     in_over_1x64 (&ms, &alpha, &mask, &dest));
3113             }
3114
3115             dst++;
3116             w--;
3117         }
3118     }
3119
3120     _mm_empty ();
3121 }
3122
3123 /*---------------------------------------------------------------------
3124  * composite_src_x888_8888
3125  */
3126
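/* x8r8g8b8 source copied to an a8r8g8b8 destination: a plain SRC
 * with the alpha byte forced to 0xff.  The inner loop is unrolled
 * to 16 pixels, i.e. four 128-bit loads ORed with mask_ff000000,
 * per iteration.
 */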
3127 static void
3128 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
3129                               pixman_op_t              op,
3130                               pixman_image_t *         src_image,
3131                               pixman_image_t *         mask_image,
3132                               pixman_image_t *         dst_image,
3133                               int32_t                  src_x,
3134                               int32_t                  src_y,
3135                               int32_t                  mask_x,
3136                               int32_t                  mask_y,
3137                               int32_t                  dest_x,
3138                               int32_t                  dest_y,
3139                               int32_t                  width,
3140                               int32_t                  height)
3141 {
3142     uint32_t    *dst_line, *dst;
3143     uint32_t    *src_line, *src;
3144     int32_t w;
3145     int dst_stride, src_stride;
3146
3148     PIXMAN_IMAGE_GET_LINE (
3149         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3150     PIXMAN_IMAGE_GET_LINE (
3151         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3152
3153     while (height--)
3154     {
3155         dst = dst_line;
3156         dst_line += dst_stride;
3157         src = src_line;
3158         src_line += src_stride;
3159         w = width;
3160
3161         while (w && (unsigned long)dst & 15)
3162         {
3163             *dst++ = *src++ | 0xff000000;
3164             w--;
3165         }
3166
3167         while (w >= 16)
3168         {
3169             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
3170             
3171             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
3172             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
3173             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
3174             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
3175             
3176             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
3177             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
3178             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
3179             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
3180             
3181             dst += 16;
3182             src += 16;
3183             w -= 16;
3184         }
3185
3186         while (w)
3187         {
3188             *dst++ = *src++ | 0xff000000;
3189             w--;
3190         }
3191     }
3192
3193     _mm_empty ();
3194 }
3195
3196 /* ---------------------------------------------------------------------
3197  * composite_over_x888_n_8888
3198  */
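/* x8r8g8b8 source OVER an a8r8g8b8 destination under a solid mask.
 * The source alpha is forced to 0xff, so the per-pixel source alpha
 * expansion reduces to the constant mask_00ff.
 */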
3199 static void
3200 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3201                                  pixman_op_t              op,
3202                                  pixman_image_t *         src_image,
3203                                  pixman_image_t *         mask_image,
3204                                  pixman_image_t *         dst_image,
3205                                  int32_t                  src_x,
3206                                  int32_t                  src_y,
3207                                  int32_t                  mask_x,
3208                                  int32_t                  mask_y,
3209                                  int32_t                  dest_x,
3210                                  int32_t                  dest_y,
3211                                  int32_t                  width,
3212                                  int32_t                  height)
3213 {
3214     uint32_t    *dst_line, *dst;
3215     uint32_t    *src_line, *src;
3216     uint32_t mask;
3217     int dst_stride, src_stride;
3218     int32_t w;
3219
3220     __m128i xmm_mask, xmm_alpha;
3221     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3222     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3223
3224     PIXMAN_IMAGE_GET_LINE (
3225         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3226     PIXMAN_IMAGE_GET_LINE (
3227         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3228
3229     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3230
3231     xmm_mask = create_mask_16_128 (mask >> 24);
3232     xmm_alpha = mask_00ff;
3233
3234     while (height--)
3235     {
3236         dst = dst_line;
3237         dst_line += dst_stride;
3238         src = src_line;
3239         src_line += src_stride;
3240         w = width;
3241
3242         while (w && (unsigned long)dst & 15)
3243         {
3244             uint32_t s = (*src++) | 0xff000000;
3245             uint32_t d = *dst;
3246
3247             __m64 src   = unpack_32_1x64 (s);
3248             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3249             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3250             __m64 dest  = unpack_32_1x64 (d);
3251
3252             *dst++ = pack_1x64_32 (
3253                 in_over_1x64 (&src, &alpha, &mask, &dest));
3254
3255             w--;
3256         }
3257
3258         while (w >= 4)
3259         {
3260             xmm_src = _mm_or_si128 (
3261                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3262             xmm_dst = load_128_aligned ((__m128i*)dst);
3263
3264             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3265             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3266
3267             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3268                            &xmm_alpha, &xmm_alpha,
3269                            &xmm_mask, &xmm_mask,
3270                            &xmm_dst_lo, &xmm_dst_hi);
3271
3272             save_128_aligned (
3273                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3274
3275             dst += 4;
3276             src += 4;
3277             w -= 4;
3279         }
3280
3281         while (w)
3282         {
3283             uint32_t s = (*src++) | 0xff000000;
3284             uint32_t d = *dst;
3285
3286             __m64 src  = unpack_32_1x64 (s);
3287             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3288             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3289             __m64 dest  = unpack_32_1x64 (d);
3290
3291             *dst++ = pack_1x64_32 (
3292                 in_over_1x64 (&src, &alpha, &mask, &dest));
3293
3294             w--;
3295         }
3296     }
3297
3298     _mm_empty ();
3299 }
3300
3301 /* --------------------------------------------------------------------
3302  * composite_over_8888_8888
3303  */
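/* a8r8g8b8 OVER a8r8g8b8 with no mask: each scanline is handed
 * straight to core_combine_over_u_sse2 with a NULL mask.
 */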
3304 static void
3305 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3306                                pixman_op_t              op,
3307                                pixman_image_t *         src_image,
3308                                pixman_image_t *         mask_image,
3309                                pixman_image_t *         dst_image,
3310                                int32_t                  src_x,
3311                                int32_t                  src_y,
3312                                int32_t                  mask_x,
3313                                int32_t                  mask_y,
3314                                int32_t                  dest_x,
3315                                int32_t                  dest_y,
3316                                int32_t                  width,
3317                                int32_t                  height)
3318 {
3319     int dst_stride, src_stride;
3320     uint32_t    *dst_line, *dst;
3321     uint32_t    *src_line, *src;
3322
3323     PIXMAN_IMAGE_GET_LINE (
3324         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3325     PIXMAN_IMAGE_GET_LINE (
3326         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3327
3328     dst = dst_line;
3329     src = src_line;
3330
3331     while (height--)
3332     {
3333         core_combine_over_u_sse2 (dst, src, NULL, width);
3334
3335         dst += dst_stride;
3336         src += src_stride;
3337     }
3338     _mm_empty ();
3339 }
3340
3341 /* ------------------------------------------------------------------
3342  * composite_over_8888_0565
3343  */
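/* Helper: composite one a8r8g8b8 source pixel OVER one r5g6b5
 * destination pixel by expanding the 565 value to 8888, blending,
 * and packing back down to 16 bits.
 */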
3344 static force_inline uint16_t
3345 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3346 {
3347     __m64 ms;
3348
3349     ms = unpack_32_1x64 (src);
3350     return pack_565_32_16 (
3351         pack_1x64_32 (
3352             over_1x64 (
3353                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3354 }
3355
3356 static void
3357 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3358                                pixman_op_t              op,
3359                                pixman_image_t *         src_image,
3360                                pixman_image_t *         mask_image,
3361                                pixman_image_t *         dst_image,
3362                                int32_t                  src_x,
3363                                int32_t                  src_y,
3364                                int32_t                  mask_x,
3365                                int32_t                  mask_y,
3366                                int32_t                  dest_x,
3367                                int32_t                  dest_y,
3368                                int32_t                  width,
3369                                int32_t                  height)
3370 {
3371     uint16_t    *dst_line, *dst, d;
3372     uint32_t    *src_line, *src, s;
3373     int dst_stride, src_stride;
3374     int32_t w;
3375
3376     __m128i xmm_alpha_lo, xmm_alpha_hi;
3377     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3378     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3379
3380     PIXMAN_IMAGE_GET_LINE (
3381         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3382     PIXMAN_IMAGE_GET_LINE (
3383         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3384
3385 #if 0
3386     /* FIXME
3387      *
3388      * This code was copied from the MMX implementation together
3389      * with its FIXME; if the assertion is a problem there, it is
3390      * probably a problem here as well.
3390      */
3391     assert (src_image->drawable == mask_image->drawable);
3392 #endif
3393
3394     while (height--)
3395     {
3396         dst = dst_line;
3397         src = src_line;
3398
3399         dst_line += dst_stride;
3400         src_line += src_stride;
3401         w = width;
3402
3403         /* Align dst on a 16-byte boundary */
3404         while (w &&
3405                ((unsigned long)dst & 15))
3406         {
3407             s = *src++;
3408             d = *dst;
3409
3410             *dst++ = composite_over_8888_0565pixel (s, d);
3411             w--;
3412         }
3413
3414         /* It's an 8-pixel loop */
3415         while (w >= 8)
3416         {
3417             /* Load the source unaligned: only dst was aligned above,
3418              * so the src address may not be on a 16-byte boundary.
3419              */
3420             xmm_src = load_128_unaligned ((__m128i*) src);
3421             xmm_dst = load_128_aligned ((__m128i*) dst);
3422
3423             /* Unpacking */
3424             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3425             unpack_565_128_4x128 (xmm_dst,
3426                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3427             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3428                                 &xmm_alpha_lo, &xmm_alpha_hi);
3429
3430             /* Load the next 4 source pixels early so the read
3431              * overlaps with the blend of the first 4 pixels.
3432              */
3433             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3434
3435             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3436                         &xmm_alpha_lo, &xmm_alpha_hi,
3437                         &xmm_dst0, &xmm_dst1);
3438
3439             /* Unpacking */
3440             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3441             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3442                                 &xmm_alpha_lo, &xmm_alpha_hi);
3443
3444             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3445                         &xmm_alpha_lo, &xmm_alpha_hi,
3446                         &xmm_dst2, &xmm_dst3);
3447
3448             save_128_aligned (
3449                 (__m128i*)dst, pack_565_4x128_128 (
3450                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3451
3452             w -= 8;
3453             dst += 8;
3454             src += 8;
3455         }
3456
3457         while (w--)
3458         {
3459             s = *src++;
3460             d = *dst;
3461
3462             *dst++ = composite_over_8888_0565pixel (s, d);
3463         }
3464     }
3465
3466     _mm_empty ();
3467 }
3468
3469 /* -----------------------------------------------------------------
3470  * composite_over_n_8_8888
3471  */
3472
3473 static void
3474 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3475                               pixman_op_t              op,
3476                               pixman_image_t *         src_image,
3477                               pixman_image_t *         mask_image,
3478                               pixman_image_t *         dst_image,
3479                               int32_t                  src_x,
3480                               int32_t                  src_y,
3481                               int32_t                  mask_x,
3482                               int32_t                  mask_y,
3483                               int32_t                  dest_x,
3484                               int32_t                  dest_y,
3485                               int32_t                  width,
3486                               int32_t                  height)
3487 {
3488     uint32_t src, srca;
3489     uint32_t *dst_line, *dst;
3490     uint8_t *mask_line, *mask;
3491     int dst_stride, mask_stride;
3492     int32_t w;
3493     uint32_t m, d;
3494
3495     __m128i xmm_src, xmm_alpha, xmm_def;
3496     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3497     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3498
3499     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3500
3501     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3502
3503     srca = src >> 24;
3504     if (src == 0)
3505         return;
3506
3507     PIXMAN_IMAGE_GET_LINE (
3508         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3509     PIXMAN_IMAGE_GET_LINE (
3510         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3511
3512     xmm_def = create_mask_2x32_128 (src, src);
3513     xmm_src = expand_pixel_32_1x128 (src);
3514     xmm_alpha = expand_alpha_1x128 (xmm_src);
3515     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3516     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3517
3518     while (height--)
3519     {
3520         dst = dst_line;
3521         dst_line += dst_stride;
3522         mask = mask_line;
3523         mask_line += mask_stride;
3524         w = width;
3525
3526         while (w && (unsigned long)dst & 15)
3527         {
3528             uint8_t m = *mask++;
3529
3530             if (m)
3531             {
3532                 d = *dst;
3533                 mmx_mask = expand_pixel_8_1x64 (m);
3534                 mmx_dest = unpack_32_1x64 (d);
3535
3536                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3537                                                    &mmx_alpha,
3538                                                    &mmx_mask,
3539                                                    &mmx_dest));
3540             }
3541
3542             w--;
3543             dst++;
3544         }
3545
3546         while (w >= 4)
3547         {
3548             m = *((uint32_t*)mask);
3549
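            /* Opaque source under a fully-set mask: the result is just
             * the solid color, so store the precomputed value directly.
             */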
3550             if (srca == 0xff && m == 0xffffffff)
3551             {
3552                 save_128_aligned ((__m128i*)dst, xmm_def);
3553             }
3554             else if (m)
3555             {
3556                 xmm_dst = load_128_aligned ((__m128i*) dst);
3557                 xmm_mask = unpack_32_1x128 (m);
3558                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3559
3560                 /* Unpacking */
3561                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3562                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3563
3564                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3565                                         &xmm_mask_lo, &xmm_mask_hi);
3566
3567                 in_over_2x128 (&xmm_src, &xmm_src,
3568                                &xmm_alpha, &xmm_alpha,
3569                                &xmm_mask_lo, &xmm_mask_hi,
3570                                &xmm_dst_lo, &xmm_dst_hi);
3571
3572                 save_128_aligned (
3573                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3574             }
3575
3576             w -= 4;
3577             dst += 4;
3578             mask += 4;
3579         }
3580
3581         while (w)
3582         {
3583             uint8_t m = *mask++;
3584
3585             if (m)
3586             {
3587                 d = *dst;
3588                 mmx_mask = expand_pixel_8_1x64 (m);
3589                 mmx_dest = unpack_32_1x64 (d);
3590
3591                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3592                                                    &mmx_alpha,
3593                                                    &mmx_mask,
3594                                                    &mmx_dest));
3595             }
3596
3597             w--;
3598             dst++;
3599         }
3600     }
3601
3602     _mm_empty ();
3603 }
3604
3605 /* ----------------------------------------------------------------
3606  * pixman_fill_sse2
3607  */
3608
3609 pixman_bool_t
3610 pixman_fill_sse2 (uint32_t *bits,
3611                   int       stride,
3612                   int       bpp,
3613                   int       x,
3614                   int       y,
3615                   int       width,
3616                   int       height,
3617                   uint32_t  data)
3618 {
3619     uint32_t byte_width;
3620     uint8_t         *byte_line;
3621
3622     __m128i xmm_def;
3623
3624     if (bpp == 8)
3625     {
3626         uint8_t b;
3627         uint16_t w;
3628
3629         stride = stride * (int) sizeof (uint32_t) / 1;
3630         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3631         byte_width = width;
3632         stride *= 1;
3633
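        /* Replicate the fill byte into all four bytes of 'data' so the
         * wider (16/32/128-bit) stores below write the same value.
         */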
3634         b = data & 0xff;
3635         w = (b << 8) | b;
3636         data = (w << 16) | w;
3637     }
3638     else if (bpp == 16)
3639     {
3640         stride = stride * (int) sizeof (uint32_t) / 2;
3641         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3642         byte_width = 2 * width;
3643         stride *= 2;
3644
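        /* Duplicate the 16-bit pixel into both halves of 'data'. */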
3645         data = (data & 0xffff) * 0x00010001;
3646     }
3647     else if (bpp == 32)
3648     {
3649         stride = stride * (int) sizeof (uint32_t) / 4;
3650         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3651         byte_width = 4 * width;
3652         stride *= 4;
3653     }
3654     else
3655     {
3656         return FALSE;
3657     }
3658
3659     xmm_def = create_mask_2x32_128 (data, data);
3660
3661     while (height--)
3662     {
3663         int w;
3664         uint8_t *d = byte_line;
3665         byte_line += stride;
3666         w = byte_width;
3667
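        /* Head: store 1, 2 and 4 bytes at a time until the destination
         * pointer is 16-byte aligned.
         */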
3668         while (w >= 1 && ((unsigned long)d & 1))
3669         {
3670             *(uint8_t *)d = data;
3671             w -= 1;
3672             d += 1;
3673         }
3674
3675         while (w >= 2 && ((unsigned long)d & 3))
3676         {
3677             *(uint16_t *)d = data;
3678             w -= 2;
3679             d += 2;
3680         }
3681
3682         while (w >= 4 && ((unsigned long)d & 15))
3683         {
3684             *(uint32_t *)d = data;
3685
3686             w -= 4;
3687             d += 4;
3688         }
3689
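        /* Bulk fill: eight aligned 16-byte stores (128 bytes) per
         * iteration, with progressively smaller tails below.
         */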
3690         while (w >= 128)
3691         {
3692             save_128_aligned ((__m128i*)(d),     xmm_def);
3693             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3694             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3695             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3696             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3697             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3698             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3699             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3700
3701             d += 128;
3702             w -= 128;
3703         }
3704
3705         if (w >= 64)
3706         {
3707             save_128_aligned ((__m128i*)(d),     xmm_def);
3708             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3709             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3710             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3711
3712             d += 64;
3713             w -= 64;
3714         }
3715
3716         if (w >= 32)
3717         {
3718             save_128_aligned ((__m128i*)(d),     xmm_def);
3719             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3720
3721             d += 32;
3722             w -= 32;
3723         }
3724
3725         if (w >= 16)
3726         {
3727             save_128_aligned ((__m128i*)(d),     xmm_def);
3728
3729             d += 16;
3730             w -= 16;
3731         }
3732
3733         while (w >= 4)
3734         {
3735             *(uint32_t *)d = data;
3736
3737             w -= 4;
3738             d += 4;
3739         }
3740
3741         if (w >= 2)
3742         {
3743             *(uint16_t *)d = data;
3744             w -= 2;
3745             d += 2;
3746         }
3747
3748         if (w >= 1)
3749         {
3750             *(uint8_t *)d = data;
3751             w -= 1;
3752             d += 1;
3753         }
3754     }
3755
3756     _mm_empty ();
3757     return TRUE;
3758 }
3759
3760 static void
3761 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3762                              pixman_op_t              op,
3763                              pixman_image_t *         src_image,
3764                              pixman_image_t *         mask_image,
3765                              pixman_image_t *         dst_image,
3766                              int32_t                  src_x,
3767                              int32_t                  src_y,
3768                              int32_t                  mask_x,
3769                              int32_t                  mask_y,
3770                              int32_t                  dest_x,
3771                              int32_t                  dest_y,
3772                              int32_t                  width,
3773                              int32_t                  height)
3774 {
3775     uint32_t src, srca;
3776     uint32_t    *dst_line, *dst;
3777     uint8_t     *mask_line, *mask;
3778     int dst_stride, mask_stride;
3779     int32_t w;
3780     uint32_t m;
3781
3782     __m128i xmm_src, xmm_def;
3783     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3784
3785     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3786
3787     srca = src >> 24;
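    /* For the SRC operator a zero source reduces to a solid fill with
     * zero, regardless of the mask.
     */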
3788     if (src == 0)
3789     {
3790         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3791                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
3792                           dest_x, dest_y, width, height, 0);
3793         return;
3794     }
3795
3796     PIXMAN_IMAGE_GET_LINE (
3797         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3798     PIXMAN_IMAGE_GET_LINE (
3799         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3800
3801     xmm_def = create_mask_2x32_128 (src, src);
3802     xmm_src = expand_pixel_32_1x128 (src);
3803
3804     while (height--)
3805     {
3806         dst = dst_line;
3807         dst_line += dst_stride;
3808         mask = mask_line;
3809         mask_line += mask_stride;
3810         w = width;
3811
3812         while (w && (unsigned long)dst & 15)
3813         {
3814             uint8_t m = *mask++;
3815
3816             if (m)
3817             {
3818                 *dst = pack_1x64_32 (
3819                     pix_multiply_1x64 (
3820                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3821             }
3822             else
3823             {
3824                 *dst = 0;
3825             }
3826
3827             w--;
3828             dst++;
3829         }
3830
3831         while (w >= 4)
3832         {
3833             m = *((uint32_t*)mask);
3834
3835             if (srca == 0xff && m == 0xffffffff)
3836             {
3837                 save_128_aligned ((__m128i*)dst, xmm_def);
3838             }
3839             else if (m)
3840             {
3841                 xmm_mask = unpack_32_1x128 (m);
3842                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3843
3844                 /* Unpacking */
3845                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3846
3847                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3848                                         &xmm_mask_lo, &xmm_mask_hi);
3849
3850                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3851                                     &xmm_mask_lo, &xmm_mask_hi,
3852                                     &xmm_mask_lo, &xmm_mask_hi);
3853
3854                 save_128_aligned (
3855                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3856             }
3857             else
3858             {
3859                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3860             }
3861
3862             w -= 4;
3863             dst += 4;
3864             mask += 4;
3865         }
3866
3867         while (w)
3868         {
3869             uint8_t m = *mask++;
3870
3871             if (m)
3872             {
3873                 *dst = pack_1x64_32 (
3874                     pix_multiply_1x64 (
3875                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3876             }
3877             else
3878             {
3879                 *dst = 0;
3880             }
3881
3882             w--;
3883             dst++;
3884         }
3885     }
3886
3887     _mm_empty ();
3888 }
3889
3890 /*-----------------------------------------------------------------------
3891  * composite_over_n_8_0565
3892  */
3893
3894 static void
3895 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3896                               pixman_op_t              op,
3897                               pixman_image_t *         src_image,
3898                               pixman_image_t *         mask_image,
3899                               pixman_image_t *         dst_image,
3900                               int32_t                  src_x,
3901                               int32_t                  src_y,
3902                               int32_t                  mask_x,
3903                               int32_t                  mask_y,
3904                               int32_t                  dest_x,
3905                               int32_t                  dest_y,
3906                               int32_t                  width,
3907                               int32_t                  height)
3908 {
3909     uint32_t src, srca;
3910     uint16_t    *dst_line, *dst, d;
3911     uint8_t     *mask_line, *mask;
3912     int dst_stride, mask_stride;
3913     int32_t w;
3914     uint32_t m;
3915     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3916
3917     __m128i xmm_src, xmm_alpha;
3918     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3919     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3920
3921     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3922
3923     srca = src >> 24;
3924     if (src == 0)
3925         return;
3926
3927     PIXMAN_IMAGE_GET_LINE (
3928         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3929     PIXMAN_IMAGE_GET_LINE (
3930         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3931
3932     xmm_src = expand_pixel_32_1x128 (src);
3933     xmm_alpha = expand_alpha_1x128 (xmm_src);
3934     mmx_src = _mm_movepi64_pi64 (xmm_src);
3935     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3936
3937     while (height--)
3938     {
3939         dst = dst_line;
3940         dst_line += dst_stride;
3941         mask = mask_line;
3942         mask_line += mask_stride;
3943         w = width;
3944
3945         while (w && (unsigned long)dst & 15)
3946         {
3947             m = *mask++;
3948
3949             if (m)
3950             {
3951                 d = *dst;
3952                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
3953                 mmx_dest = expand565_16_1x64 (d);
3954
3955                 *dst = pack_565_32_16 (
3956                     pack_1x64_32 (
3957                         in_over_1x64 (
3958                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3959             }
3960
3961             w--;
3962             dst++;
3963         }
3964
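        /* 8 pixels per iteration: unpack the 565 destination once, then
         * blend each 4-pixel half only when its 4 mask bytes are not all
         * zero; a zero mask leaves the destination unchanged.
         */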
3965         while (w >= 8)
3966         {
3967             xmm_dst = load_128_aligned ((__m128i*) dst);
3968             unpack_565_128_4x128 (xmm_dst,
3969                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3970
3971             m = *((uint32_t*)mask);
3972             mask += 4;
3973
3974             if (m)
3975             {
3976                 xmm_mask = unpack_32_1x128 (m);
3977                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3978
3979                 /* Unpacking */
3980                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3981
3982                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3983                                         &xmm_mask_lo, &xmm_mask_hi);
3984
3985                 in_over_2x128 (&xmm_src, &xmm_src,
3986                                &xmm_alpha, &xmm_alpha,
3987                                &xmm_mask_lo, &xmm_mask_hi,
3988                                &xmm_dst0, &xmm_dst1);
3989             }
3990
3991             m = *((uint32_t*)mask);
3992             mask += 4;
3993
3994             if (m)
3995             {
3996                 xmm_mask = unpack_32_1x128 (m);
3997                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3998
3999                 /* Unpacking */
4000                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4001
4002                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4003                                         &xmm_mask_lo, &xmm_mask_hi);
4004                 in_over_2x128 (&xmm_src, &xmm_src,
4005                                &xmm_alpha, &xmm_alpha,
4006                                &xmm_mask_lo, &xmm_mask_hi,
4007                                &xmm_dst2, &xmm_dst3);
4008             }
4009
4010             save_128_aligned (
4011                 (__m128i*)dst, pack_565_4x128_128 (
4012                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4013
4014             w -= 8;
4015             dst += 8;
4016         }
4017
4018         while (w)
4019         {
4020             m = *mask++;
4021
4022             if (m)
4023             {
4024                 d = *dst;
4025                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4026                 mmx_dest = expand565_16_1x64 (d);
4027
4028                 *dst = pack_565_32_16 (
4029                     pack_1x64_32 (
4030                         in_over_1x64 (
4031                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4032             }
4033
4034             w--;
4035             dst++;
4036         }
4037     }
4038
4039     _mm_empty ();
4040 }
4041
4042 /* -----------------------------------------------------------------------
4043  * composite_over_pixbuf_0565
4044  */
4045
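/* Judging by the helper names, over_rev_non_pre_* treats the source as
 * non-premultiplied with R and B reversed relative to the destination,
 * premultiplying and reordering it before the OVER.  is_opaque/is_zero
 * let all-opaque groups take the cheap color-swap path and
 * all-transparent groups skip the blend entirely.
 */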
4046 static void
4047 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4048                                  pixman_op_t              op,
4049                                  pixman_image_t *         src_image,
4050                                  pixman_image_t *         mask_image,
4051                                  pixman_image_t *         dst_image,
4052                                  int32_t                  src_x,
4053                                  int32_t                  src_y,
4054                                  int32_t                  mask_x,
4055                                  int32_t                  mask_y,
4056                                  int32_t                  dest_x,
4057                                  int32_t                  dest_y,
4058                                  int32_t                  width,
4059                                  int32_t                  height)
4060 {
4061     uint16_t    *dst_line, *dst, d;
4062     uint32_t    *src_line, *src, s;
4063     int dst_stride, src_stride;
4064     int32_t w;
4065     uint32_t opaque, zero;
4066
4067     __m64 ms;
4068     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4069     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4070
4071     PIXMAN_IMAGE_GET_LINE (
4072         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4073     PIXMAN_IMAGE_GET_LINE (
4074         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4075
4076 #if 0
4077     /* FIXME
4078      *
4079      * This code was copied from the MMX version, FIXME included.
4080      * If it's a problem there, it's probably a problem here.
4081      */
4082     assert (src_image->drawable == mask_image->drawable);
4083 #endif
4084
4085     while (height--)
4086     {
4087         dst = dst_line;
4088         dst_line += dst_stride;
4089         src = src_line;
4090         src_line += src_stride;
4091         w = width;
4092
4093         while (w && (unsigned long)dst & 15)
4094         {
4095             s = *src++;
4096             d = *dst;
4097
4098             ms = unpack_32_1x64 (s);
4099
4100             *dst++ = pack_565_32_16 (
4101                 pack_1x64_32 (
4102                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4103             w--;
4104         }
4105
4106         while (w >= 8)
4107         {
4108             /* First round */
4109             xmm_src = load_128_unaligned ((__m128i*)src);
4110             xmm_dst = load_128_aligned  ((__m128i*)dst);
4111
4112             opaque = is_opaque (xmm_src);
4113             zero = is_zero (xmm_src);
4114
4115             unpack_565_128_4x128 (xmm_dst,
4116                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4117             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4118
4119             /* Preload the next round */
4120             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4121
4122             if (opaque)
4123             {
4124                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4125                                      &xmm_dst0, &xmm_dst1);
4126             }
4127             else if (!zero)
4128             {
4129                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4130                                         &xmm_dst0, &xmm_dst1);
4131             }
4132
4133             /* Second round */
4134             opaque = is_opaque (xmm_src);
4135             zero = is_zero (xmm_src);
4136
4137             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4138
4139             if (opaque)
4140             {
4141                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4142                                      &xmm_dst2, &xmm_dst3);
4143             }
4144             else if (!zero)
4145             {
4146                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4147                                         &xmm_dst2, &xmm_dst3);
4148             }
4149
4150             save_128_aligned (
4151                 (__m128i*)dst, pack_565_4x128_128 (
4152                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4153
4154             w -= 8;
4155             src += 8;
4156             dst += 8;
4157         }
4158
4159         while (w)
4160         {
4161             s = *src++;
4162             d = *dst;
4163
4164             ms = unpack_32_1x64 (s);
4165
4166             *dst++ = pack_565_32_16 (
4167                 pack_1x64_32 (
4168                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4169             w--;
4170         }
4171     }
4172
4173     _mm_empty ();
4174 }
4175
4176 /* -------------------------------------------------------------------------
4177  * composite_over_pixbuf_8888
4178  */
4179
4180 static void
4181 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4182                                  pixman_op_t              op,
4183                                  pixman_image_t *         src_image,
4184                                  pixman_image_t *         mask_image,
4185                                  pixman_image_t *         dst_image,
4186                                  int32_t                  src_x,
4187                                  int32_t                  src_y,
4188                                  int32_t                  mask_x,
4189                                  int32_t                  mask_y,
4190                                  int32_t                  dest_x,
4191                                  int32_t                  dest_y,
4192                                  int32_t                  width,
4193                                  int32_t                  height)
4194 {
4195     uint32_t    *dst_line, *dst, d;
4196     uint32_t    *src_line, *src, s;
4197     int dst_stride, src_stride;
4198     int32_t w;
4199     uint32_t opaque, zero;
4200
4201     __m128i xmm_src_lo, xmm_src_hi;
4202     __m128i xmm_dst_lo, xmm_dst_hi;
4203
4204     PIXMAN_IMAGE_GET_LINE (
4205         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4206     PIXMAN_IMAGE_GET_LINE (
4207         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4208
4209 #if 0
4210     /* FIXME
4211      *
4212      * This code was copied from the MMX version, FIXME included.
4213      * If it's a problem there, it's probably a problem here.
4214      */
4215     assert (src_image->drawable == mask_image->drawable);
4216 #endif
4217
4218     while (height--)
4219     {
4220         dst = dst_line;
4221         dst_line += dst_stride;
4222         src = src_line;
4223         src_line += src_stride;
4224         w = width;
4225
4226         while (w && (unsigned long)dst & 15)
4227         {
4228             s = *src++;
4229             d = *dst;
4230
4231             *dst++ = pack_1x64_32 (
4232                 over_rev_non_pre_1x64 (
4233                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4234
4235             w--;
4236         }
4237
4238         while (w >= 4)
4239         {
4240             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4241
4242             opaque = is_opaque (xmm_src_hi);
4243             zero = is_zero (xmm_src_hi);
4244
4245             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4246
4247             if (opaque)
4248             {
4249                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4250                                      &xmm_dst_lo, &xmm_dst_hi);
4251
4252                 save_128_aligned (
4253                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4254             }
4255             else if (!zero)
4256             {
4257                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4258
4259                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4260
4261                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4262                                         &xmm_dst_lo, &xmm_dst_hi);
4263
4264                 save_128_aligned (
4265                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4266             }
4267
4268             w -= 4;
4269             dst += 4;
4270             src += 4;
4271         }
4272
4273         while (w)
4274         {
4275             s = *src++;
4276             d = *dst;
4277
4278             *dst++ = pack_1x64_32 (
4279                 over_rev_non_pre_1x64 (
4280                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4281
4282             w--;
4283         }
4284     }
4285
4286     _mm_empty ();
4287 }
4288
4289 /* -------------------------------------------------------------------------------------------------
4290  * composite_over_n_8888_0565_ca
4291  */
4292
4293 static void
4294 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4295                                     pixman_op_t              op,
4296                                     pixman_image_t *         src_image,
4297                                     pixman_image_t *         mask_image,
4298                                     pixman_image_t *         dst_image,
4299                                     int32_t                  src_x,
4300                                     int32_t                  src_y,
4301                                     int32_t                  mask_x,
4302                                     int32_t                  mask_y,
4303                                     int32_t                  dest_x,
4304                                     int32_t                  dest_y,
4305                                     int32_t                  width,
4306                                     int32_t                  height)
4307 {
4308     uint32_t src;
4309     uint16_t    *dst_line, *dst, d;
4310     uint32_t    *mask_line, *mask, m;
4311     int dst_stride, mask_stride;
4312     int w;
4313     uint32_t pack_cmp;
4314
4315     __m128i xmm_src, xmm_alpha;
4316     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4317     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4318
4319     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4320
4321     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4322
4323     if (src == 0)
4324         return;
4325
4326     PIXMAN_IMAGE_GET_LINE (
4327         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4328     PIXMAN_IMAGE_GET_LINE (
4329         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4330
4331     xmm_src = expand_pixel_32_1x128 (src);
4332     xmm_alpha = expand_alpha_1x128 (xmm_src);
4333     mmx_src = _mm_movepi64_pi64 (xmm_src);
4334     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4335
4336     while (height--)
4337     {
4338         w = width;
4339         mask = mask_line;
4340         dst = dst_line;
4341         mask_line += mask_stride;
4342         dst_line += dst_stride;
4343
4344         while (w && ((unsigned long)dst & 15))
4345         {
4346             m = *(uint32_t *) mask;
4347
4348             if (m)
4349             {
4350                 d = *dst;
4351                 mmx_mask = unpack_32_1x64 (m);
4352                 mmx_dest = expand565_16_1x64 (d);
4353
4354                 *dst = pack_565_32_16 (
4355                     pack_1x64_32 (
4356                         in_over_1x64 (
4357                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4358             }
4359
4360             w--;
4361             dst++;
4362             mask++;
4363         }
4364
4365         while (w >= 8)
4366         {
4367             /* First round */
4368             xmm_mask = load_128_unaligned ((__m128i*)mask);
4369             xmm_dst = load_128_aligned ((__m128i*)dst);
4370
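            /* pack_cmp is 0xffff iff all four mask pixels are zero; in
             * that case the corresponding destination pixels pass
             * through unchanged.
             */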
4371             pack_cmp = _mm_movemask_epi8 (
4372                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4373
4374             unpack_565_128_4x128 (xmm_dst,
4375                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4376             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4377
4378             /* preload next round */
4379             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4380
4382             if (pack_cmp != 0xffff)
4383             {
4384                 in_over_2x128 (&xmm_src, &xmm_src,
4385                                &xmm_alpha, &xmm_alpha,
4386                                &xmm_mask_lo, &xmm_mask_hi,
4387                                &xmm_dst0, &xmm_dst1);
4388             }
4389
4390             /* Second round */
4391             pack_cmp = _mm_movemask_epi8 (
4392                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4393
4394             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4395
4396             if (pack_cmp != 0xffff)
4397             {
4398                 in_over_2x128 (&xmm_src, &xmm_src,
4399                                &xmm_alpha, &xmm_alpha,
4400                                &xmm_mask_lo, &xmm_mask_hi,
4401                                &xmm_dst2, &xmm_dst3);
4402             }
4403
4404             save_128_aligned (
4405                 (__m128i*)dst, pack_565_4x128_128 (
4406                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4407
4408             w -= 8;
4409             dst += 8;
4410             mask += 8;
4411         }
4412
4413         while (w)
4414         {
4415             m = *(uint32_t *) mask;
4416
4417             if (m)
4418             {
4419                 d = *dst;
4420                 mmx_mask = unpack_32_1x64 (m);
4421                 mmx_dest = expand565_16_1x64 (d);
4422
4423                 *dst = pack_565_32_16 (
4424                     pack_1x64_32 (
4425                         in_over_1x64 (
4426                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4427             }
4428
4429             w--;
4430             dst++;
4431             mask++;
4432         }
4433     }
4434
4435     _mm_empty ();
4436 }
4437
4438 /* -----------------------------------------------------------------------
4439  * composite_in_n_8_8
4440  */
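/* IN on an a8 destination with a solid source and an a8 mask:
 * dest = srca * mask * dest, each factor an 8-bit fraction
 * (computed with pix_multiply).
 */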
4441
4442 static void
4443 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4444                          pixman_op_t              op,
4445                          pixman_image_t *         src_image,
4446                          pixman_image_t *         mask_image,
4447                          pixman_image_t *         dst_image,
4448                          int32_t                  src_x,
4449                          int32_t                  src_y,
4450                          int32_t                  mask_x,
4451                          int32_t                  mask_y,
4452                          int32_t                  dest_x,
4453                          int32_t                  dest_y,
4454                          int32_t                  width,
4455                          int32_t                  height)
4456 {
4457     uint8_t     *dst_line, *dst;
4458     uint8_t     *mask_line, *mask;
4459     int dst_stride, mask_stride;
4460     uint32_t d, m;
4461     uint32_t src;
4462     uint8_t sa;
4463     int32_t w;
4464
4465     __m128i xmm_alpha;
4466     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4467     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4468
4469     PIXMAN_IMAGE_GET_LINE (
4470         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4471     PIXMAN_IMAGE_GET_LINE (
4472         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4473
4474     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4475
4476     sa = src >> 24;
4477
4478     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4479
4480     while (height--)
4481     {
4482         dst = dst_line;
4483         dst_line += dst_stride;
4484         mask = mask_line;
4485         mask_line += mask_stride;
4486         w = width;
4487
4488         while (w && ((unsigned long)dst & 15))
4489         {
4490             m = (uint32_t) *mask++;
4491             d = (uint32_t) *dst;
4492
4493             *dst++ = (uint8_t) pack_1x64_32 (
4494                 pix_multiply_1x64 (
4495                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4496                                        unpack_32_1x64 (m)),
4497                     unpack_32_1x64 (d)));
4498             w--;
4499         }
4500
4501         while (w >= 16)
4502         {
4503             xmm_mask = load_128_unaligned ((__m128i*)mask);
4504             xmm_dst = load_128_aligned ((__m128i*)dst);
4505
4506             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4507             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4508
4509             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4510                                 &xmm_mask_lo, &xmm_mask_hi,
4511                                 &xmm_mask_lo, &xmm_mask_hi);
4512
4513             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4514                                 &xmm_dst_lo, &xmm_dst_hi,
4515                                 &xmm_dst_lo, &xmm_dst_hi);
4516
4517             save_128_aligned (
4518                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4519
4520             mask += 16;
4521             dst += 16;
4522             w -= 16;
4523         }
4524
4525         while (w)
4526         {
4527             m = (uint32_t) *mask++;
4528             d = (uint32_t) *dst;
4529
4530             *dst++ = (uint8_t) pack_1x64_32 (
4531                 pix_multiply_1x64 (
4532                     pix_multiply_1x64 (
4533                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4534                     unpack_32_1x64 (d)));
4535             w--;
4536         }
4537     }
4538
4539     _mm_empty ();
4540 }
4541
4542 /* -----------------------------------------------------------------------
4543  * composite_in_n_8
4544  */
4545
4546 static void
4547 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4548                        pixman_op_t              op,
4549                        pixman_image_t *         src_image,
4550                        pixman_image_t *         mask_image,
4551                        pixman_image_t *         dst_image,
4552                        int32_t                  src_x,
4553                        int32_t                  src_y,
4554                        int32_t                  mask_x,
4555                        int32_t                  mask_y,
4556                        int32_t                  dest_x,
4557                        int32_t                  dest_y,
4558                        int32_t                  width,
4559                        int32_t                  height)
4560 {
4561     uint8_t     *dst_line, *dst;
4562     int dst_stride;
4563     uint32_t d;
4564     uint32_t src;
4565     int32_t w;
4566
4567     __m128i xmm_alpha;
4568     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4569
4570     PIXMAN_IMAGE_GET_LINE (
4571         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4572
4573     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4574
4575     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4576
4577     src = src >> 24;
4578
4579     if (src == 0xff)
4580         return;
4581
4582     if (src == 0x00)
4583     {
4584         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4585                      8, dest_x, dest_y, width, height, src);
4586
4587         return;
4588     }
4589
4590     while (height--)
4591     {
4592         dst = dst_line;
4593         dst_line += dst_stride;
4594         w = width;
4595
4596         while (w && ((unsigned long)dst & 15))
4597         {
4598             d = (uint32_t) *dst;
4599
4600             *dst++ = (uint8_t) pack_1x64_32 (
4601                 pix_multiply_1x64 (
4602                     _mm_movepi64_pi64 (xmm_alpha),
4603                     unpack_32_1x64 (d)));
4604             w--;
4605         }
4606
4607         while (w >= 16)
4608         {
4609             xmm_dst = load_128_aligned ((__m128i*)dst);
4610
4611             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4612
4613             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4614                                 &xmm_dst_lo, &xmm_dst_hi,
4615                                 &xmm_dst_lo, &xmm_dst_hi);
4616
4617             save_128_aligned (
4618                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4619
4620             dst += 16;
4621             w -= 16;
4622         }
4623
4624         while (w)
4625         {
4626             d = (uint32_t) *dst;
4627
4628             *dst++ = (uint8_t) pack_1x64_32 (
4629                 pix_multiply_1x64 (
4630                     _mm_movepi64_pi64 (xmm_alpha),
4631                     unpack_32_1x64 (d)));
4632             w--;
4633         }
4634     }
4635
4636     _mm_empty ();
4637 }
4638
4639 /* ---------------------------------------------------------------------------
4640  * composite_in_8_8
4641  */
4642
4643 static void
4644 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4645                        pixman_op_t              op,
4646                        pixman_image_t *         src_image,
4647                        pixman_image_t *         mask_image,
4648                        pixman_image_t *         dst_image,
4649                        int32_t                  src_x,
4650                        int32_t                  src_y,
4651                        int32_t                  mask_x,
4652                        int32_t                  mask_y,
4653                        int32_t                  dest_x,
4654                        int32_t                  dest_y,
4655                        int32_t                  width,
4656                        int32_t                  height)
4657 {
4658     uint8_t     *dst_line, *dst;
4659     uint8_t     *src_line, *src;
4660     int src_stride, dst_stride;
4661     int32_t w;
4662     uint32_t s, d;
4663
4664     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4665     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4666
4667     PIXMAN_IMAGE_GET_LINE (
4668         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4669     PIXMAN_IMAGE_GET_LINE (
4670         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4671
4672     while (height--)
4673     {
4674         dst = dst_line;
4675         dst_line += dst_stride;
4676         src = src_line;
4677         src_line += src_stride;
4678         w = width;
4679
4680         while (w && ((unsigned long)dst & 15))
4681         {
4682             s = (uint32_t) *src++;
4683             d = (uint32_t) *dst;
4684
4685             *dst++ = (uint8_t) pack_1x64_32 (
4686                 pix_multiply_1x64 (
4687                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4688             w--;
4689         }
4690
4691         while (w >= 16)
4692         {
4693             xmm_src = load_128_unaligned ((__m128i*)src);
4694             xmm_dst = load_128_aligned ((__m128i*)dst);
4695
4696             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4697             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4698
4699             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4700                                 &xmm_dst_lo, &xmm_dst_hi,
4701                                 &xmm_dst_lo, &xmm_dst_hi);
4702
4703             save_128_aligned (
4704                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4705
4706             src += 16;
4707             dst += 16;
4708             w -= 16;
4709         }
4710
4711         while (w)
4712         {
4713             s = (uint32_t) *src++;
4714             d = (uint32_t) *dst;
4715
4716             *dst++ = (uint8_t) pack_1x64_32 (
4717                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
4718             w--;
4719         }
4720     }
4721
4722     _mm_empty ();
4723 }
4724
4725 /* -------------------------------------------------------------------------
4726  * composite_add_n_8_8
4727  */
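/* ADD on an a8 destination with a solid source and an a8 mask:
 * dest = clamp (srca * mask + dest), using saturating adds.
 */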
4728
4729 static void
4730 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4731                           pixman_op_t              op,
4732                           pixman_image_t *         src_image,
4733                           pixman_image_t *         mask_image,
4734                           pixman_image_t *         dst_image,
4735                           int32_t                  src_x,
4736                           int32_t                  src_y,
4737                           int32_t                  mask_x,
4738                           int32_t                  mask_y,
4739                           int32_t                  dest_x,
4740                           int32_t                  dest_y,
4741                           int32_t                  width,
4742                           int32_t                  height)
4743 {
4744     uint8_t     *dst_line, *dst;
4745     uint8_t     *mask_line, *mask;
4746     int dst_stride, mask_stride;
4747     int32_t w;
4748     uint32_t src;
4749     uint8_t sa;
4750     uint32_t m, d;
4751
4752     __m128i xmm_alpha;
4753     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4754     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4755
4756     PIXMAN_IMAGE_GET_LINE (
4757         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4758     PIXMAN_IMAGE_GET_LINE (
4759         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4760
4761     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4762
4763     sa = src >> 24;
4764
4765     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4766
4767     while (height--)
4768     {
4769         dst = dst_line;
4770         dst_line += dst_stride;
4771         mask = mask_line;
4772         mask_line += mask_stride;
4773         w = width;
4774
4775         while (w && ((unsigned long)dst & 15))
4776         {
4777             m = (uint32_t) *mask++;
4778             d = (uint32_t) *dst;
4779
4780             *dst++ = (uint8_t) pack_1x64_32 (
4781                 _mm_adds_pu16 (
4782                     pix_multiply_1x64 (
4783                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4784                     unpack_32_1x64 (d)));
4785             w--;
4786         }
4787
4788         while (w >= 16)
4789         {
4790             xmm_mask = load_128_unaligned ((__m128i*)mask);
4791             xmm_dst = load_128_aligned ((__m128i*)dst);
4792
4793             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4794             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4795
4796             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4797                                 &xmm_mask_lo, &xmm_mask_hi,
4798                                 &xmm_mask_lo, &xmm_mask_hi);
4799
4800             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4801             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4802
4803             save_128_aligned (
4804                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4805
4806             mask += 16;
4807             dst += 16;
4808             w -= 16;
4809         }
4810
4811         while (w)
4812         {
4813             m = (uint32_t) *mask++;
4814             d = (uint32_t) *dst;
4815
4816             *dst++ = (uint8_t) pack_1x64_32 (
4817                 _mm_adds_pu16 (
4818                     pix_multiply_1x64 (
4819                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4820                     unpack_32_1x64 (d)));
4821
4822             w--;
4823         }
4824     }
4825
4826     _mm_empty ();
4827 }
4828
4829 /* -------------------------------------------------------------------------
4830  * composite_add_n_8
4831  */
4832
4833 static void
4834 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4835                         pixman_op_t              op,
4836                         pixman_image_t *         src_image,
4837                         pixman_image_t *         mask_image,
4838                         pixman_image_t *         dst_image,
4839                         int32_t                  src_x,
4840                         int32_t                  src_y,
4841                         int32_t                  mask_x,
4842                         int32_t                  mask_y,
4843                         int32_t                  dest_x,
4844                         int32_t                  dest_y,
4845                         int32_t                  width,
4846                         int32_t                  height)
4847 {
4848     uint8_t     *dst_line, *dst;
4849     int dst_stride;
4850     int32_t w;
4851     uint32_t src;
4852
4853     __m128i xmm_src;
4854
4855     PIXMAN_IMAGE_GET_LINE (
4856         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4857
4858     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4859
4860     src >>= 24;
4861
4862     if (src == 0x00)
4863         return;
4864
4865     if (src == 0xff)
4866     {
4867         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4868                      8, dest_x, dest_y, width, height, 0xff);
4869
4870         return;
4871     }
4872
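    /* Broadcast the 8-bit source alpha to every byte of a 128-bit
     * constant so whole 16-byte groups can be added at once.
     */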
4873     src = (src << 24) | (src << 16) | (src << 8) | src;
4874     xmm_src = _mm_set_epi32 (src, src, src, src);
4875
4876     while (height--)
4877     {
4878         dst = dst_line;
4879         dst_line += dst_stride;
4880         w = width;
4881
4882         while (w && ((unsigned long)dst & 15))
4883         {
4884             *dst = (uint8_t)_mm_cvtsi64_si32 (
4885                 _mm_adds_pu8 (
4886                     _mm_movepi64_pi64 (xmm_src),
4887                     _mm_cvtsi32_si64 (*dst)));
4888
4889             w--;
4890             dst++;
4891         }
4892
4893         while (w >= 16)
4894         {
4895             save_128_aligned (
4896                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4897
4898             dst += 16;
4899             w -= 16;
4900         }
4901
4902         while (w)
4903         {
4904             *dst = (uint8_t)_mm_cvtsi64_si32 (
4905                 _mm_adds_pu8 (
4906                     _mm_movepi64_pi64 (xmm_src),
4907                     _mm_cvtsi32_si64 (*dst)));
4908
4909             w--;
4910             dst++;
4911         }
4912     }
4913
4914     _mm_empty ();
4915 }
4916
4917 /* ----------------------------------------------------------------------
4918  * composite_add_8_8
4919  */
4920
4921 static void
4922 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4923                         pixman_op_t              op,
4924                         pixman_image_t *         src_image,
4925                         pixman_image_t *         mask_image,
4926                         pixman_image_t *         dst_image,
4927                         int32_t                  src_x,
4928                         int32_t                  src_y,
4929                         int32_t                  mask_x,
4930                         int32_t                  mask_y,
4931                         int32_t                  dest_x,
4932                         int32_t                  dest_y,
4933                         int32_t                  width,
4934                         int32_t                  height)
4935 {
4936     uint8_t     *dst_line, *dst;
4937     uint8_t     *src_line, *src;
4938     int dst_stride, src_stride;
4939     int32_t w;
4940     uint16_t t;
4941
4942     PIXMAN_IMAGE_GET_LINE (
4943         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4944     PIXMAN_IMAGE_GET_LINE (
4945         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4946
4947     while (height--)
4948     {
4949         dst = dst_line;
4950         src = src_line;
4951
4952         dst_line += dst_stride;
4953         src_line += src_stride;
4954         w = width;
4955
4956         /* Small head */
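        /* t is at most 0x1fe, so (t >> 8) is 1 exactly when the sum
         * overflows a byte; 0 - (t >> 8) is then an all-ones mask that
         * clamps the stored result to 0xff.
         */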
4957         while (w && (unsigned long)dst & 3)
4958         {
4959             t = (*dst) + (*src++);
4960             *dst++ = t | (0 - (t >> 8));
4961             w--;
4962         }
4963
4964         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4965
4966         /* Small tail */
4967         dst += w & 0xfffc;
4968         src += w & 0xfffc;
4969
4970         w &= 3;
4971
4972         while (w)
4973         {
4974             t = (*dst) + (*src++);
4975             *dst++ = t | (0 - (t >> 8));
4976             w--;
4977         }
4978     }
4979
4980     _mm_empty ();
4981 }
4982
4983 /* ---------------------------------------------------------------------
4984  * composite_add_8888_8888
4985  */
4986 static void
4987 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4988                               pixman_op_t              op,
4989                               pixman_image_t *         src_image,
4990                               pixman_image_t *         mask_image,
4991                               pixman_image_t *         dst_image,
4992                               int32_t                  src_x,
4993                               int32_t                  src_y,
4994                               int32_t                  mask_x,
4995                               int32_t                  mask_y,
4996                               int32_t                  dest_x,
4997                               int32_t                  dest_y,
4998                               int32_t                  width,
4999                               int32_t                  height)
5000 {
5001     uint32_t    *dst_line, *dst;
5002     uint32_t    *src_line, *src;
5003     int dst_stride, src_stride;
5004
5005     PIXMAN_IMAGE_GET_LINE (
5006         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5007     PIXMAN_IMAGE_GET_LINE (
5008         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5009
5010     while (height--)
5011     {
5012         dst = dst_line;
5013         dst_line += dst_stride;
5014         src = src_line;
5015         src_line += src_stride;
5016
5017         core_combine_add_u_sse2 (dst, src, NULL, width);
5018     }
5019
5020     _mm_empty ();
5021 }
5022
5023 /* -------------------------------------------------------------------------------------------------
5024  * sse2_composite_copy_area
5025  */
5026
5027 static pixman_bool_t
5028 pixman_blt_sse2 (uint32_t *src_bits,
5029                  uint32_t *dst_bits,
5030                  int       src_stride,
5031                  int       dst_stride,
5032                  int       src_bpp,
5033                  int       dst_bpp,
5034                  int       src_x,
5035                  int       src_y,
5036                  int       dst_x,
5037                  int       dst_y,
5038                  int       width,
5039                  int       height)
5040 {
5041     uint8_t *   src_bytes;
5042     uint8_t *   dst_bytes;
5043     int byte_width;
5044
5045     if (src_bpp != dst_bpp)
5046         return FALSE;
5047
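    /* rowstride is counted in uint32_t units; convert it to pixel
     * units for the addressing below, then to bytes for the copy loop.
     */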
5048     if (src_bpp == 16)
5049     {
5050         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5051         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5052         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5053         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5054         byte_width = 2 * width;
5055         src_stride *= 2;
5056         dst_stride *= 2;
5057     }
5058     else if (src_bpp == 32)
5059     {
5060         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5061         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5062         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5063         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5064         byte_width = 4 * width;
5065         src_stride *= 4;
5066         dst_stride *= 4;
5067     }
5068     else
5069     {
5070         return FALSE;
5071     }
5072
5073     while (height--)
5074     {
5075         int w;
5076         uint8_t *s = src_bytes;
5077         uint8_t *d = dst_bytes;
5078         src_bytes += src_stride;
5079         dst_bytes += dst_stride;
5080         w = byte_width;
5081
5082         while (w >= 2 && ((unsigned long)d & 3))
5083         {
5084             *(uint16_t *)d = *(uint16_t *)s;
5085             w -= 2;
5086             s += 2;
5087             d += 2;
5088         }
5089
5090         while (w >= 4 && ((unsigned long)d & 15))
5091         {
5092             *(uint32_t *)d = *(uint32_t *)s;
5093
5094             w -= 4;
5095             s += 4;
5096             d += 4;
5097         }
5098
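        /* Copy 64 bytes per iteration: four unaligned loads paired
         * with four aligned stores (d was aligned to 16 bytes above).
         */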
5099         while (w >= 64)
5100         {
5101             __m128i xmm0, xmm1, xmm2, xmm3;
5102
5103             xmm0 = load_128_unaligned ((__m128i*)(s));
5104             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5105             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5106             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5107
5108             save_128_aligned ((__m128i*)(d),    xmm0);
5109             save_128_aligned ((__m128i*)(d + 16), xmm1);
5110             save_128_aligned ((__m128i*)(d + 32), xmm2);
5111             save_128_aligned ((__m128i*)(d + 48), xmm3);
5112
5113             s += 64;
5114             d += 64;
5115             w -= 64;
5116         }
5117
5118         while (w >= 16)
5119         {
5120             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
5121
5122             w -= 16;
5123             d += 16;
5124             s += 16;
5125         }
5126
5127         while (w >= 4)
5128         {
5129             *(uint32_t *)d = *(uint32_t *)s;
5130
5131             w -= 4;
5132             s += 4;
5133             d += 4;
5134         }
5135
5136         if (w >= 2)
5137         {
5138             *(uint16_t *)d = *(uint16_t *)s;
5139             w -= 2;
5140             s += 2;
5141             d += 2;
5142         }
5143     }
5144
5145     _mm_empty ();
5146
5147     return TRUE;
5148 }
5149
5150 static void
5151 sse2_composite_copy_area (pixman_implementation_t *imp,
5152                           pixman_op_t              op,
5153                           pixman_image_t *         src_image,
5154                           pixman_image_t *         mask_image,
5155                           pixman_image_t *         dst_image,
5156                           int32_t                  src_x,
5157                           int32_t                  src_y,
5158                           int32_t                  mask_x,
5159                           int32_t                  mask_y,
5160                           int32_t                  dest_x,
5161                           int32_t                  dest_y,
5162                           int32_t                  width,
5163                           int32_t                  height)
5164 {
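    /* The fast-path entries that point here pair equal-bpp formats,
     * so pixman_blt_sse2 cannot fail and its return value is ignored.
     */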
5165     pixman_blt_sse2 (src_image->bits.bits,
5166                      dst_image->bits.bits,
5167                      src_image->bits.rowstride,
5168                      dst_image->bits.rowstride,
5169                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5170                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5171                      src_x, src_y, dest_x, dest_y, width, height);
5172 }
5173
5174 static void
5175 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5176                                  pixman_op_t              op,
5177                                  pixman_image_t *         src_image,
5178                                  pixman_image_t *         mask_image,
5179                                  pixman_image_t *         dst_image,
5180                                  int32_t                  src_x,
5181                                  int32_t                  src_y,
5182                                  int32_t                  mask_x,
5183                                  int32_t                  mask_y,
5184                                  int32_t                  dest_x,
5185                                  int32_t                  dest_y,
5186                                  int32_t                  width,
5187                                  int32_t                  height)
5188 {
5189     uint32_t    *src, *src_line, s;
5190     uint32_t    *dst, *dst_line, d;
5191     uint8_t         *mask, *mask_line;
5192     uint32_t m;
5193     int src_stride, mask_stride, dst_stride;
5194     int32_t w;
5195     __m64 ms;
5196
5197     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5198     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5199     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5200
5201     PIXMAN_IMAGE_GET_LINE (
5202         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5203     PIXMAN_IMAGE_GET_LINE (
5204         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5205     PIXMAN_IMAGE_GET_LINE (
5206         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5207
5208     while (height--)
5209     {
5210         src = src_line;
5211         src_line += src_stride;
5212         dst = dst_line;
5213         dst_line += dst_stride;
5214         mask = mask_line;
5215         mask_line += mask_stride;
5216
5217         w = width;
5218
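        /* x888 sources carry undefined alpha, so every pixel is
         * forced opaque with 0xff000000 before compositing.
         */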
5219         while (w && (unsigned long)dst & 15)
5220         {
5221             s = 0xff000000 | *src++;
5222             m = (uint32_t) *mask++;
5223             d = *dst;
5224             ms = unpack_32_1x64 (s);
5225
5226             if (m != 0xff)
5227             {
5228                 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5229                 __m64 md = unpack_32_1x64 (d);
5230
5231                 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5232             }
5233
5234             *dst++ = pack_1x64_32 (ms);
5235             w--;
5236         }
5237
5238         while (w >= 4)
5239         {
5240             m = *(uint32_t*) mask;
5241             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5242
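            /* All four mask bytes 0xff over an opaque (forced-alpha)
             * source: OVER degenerates to a straight store.
             */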
5243             if (m == 0xffffffff)
5244             {
5245                 save_128_aligned ((__m128i*)dst, xmm_src);
5246             }
5247             else
5248             {
5249                 xmm_dst = load_128_aligned ((__m128i*)dst);
5250
5251                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5252
5253                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5254                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5255                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5256
5257                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5258
5259                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5260
5261                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5262             }
5263
5264             src += 4;
5265             dst += 4;
5266             mask += 4;
5267             w -= 4;
5268         }
5269
5270         while (w)
5271         {
5272             m = (uint32_t) *mask++;
5273
5274             if (m)
5275             {
5276                 s = 0xff000000 | *src;
5277
5278                 if (m == 0xff)
5279                 {
5280                     *dst = s;
5281                 }
5282                 else
5283                 {
5284                     __m64 ma, md, ms;
5285
5286                     d = *dst;
5287
5288                     ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5289                     md = unpack_32_1x64 (d);
5290                     ms = unpack_32_1x64 (s);
5291
5292                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5293                 }
5294
5295             }
5296
5297             src++;
5298             dst++;
5299             w--;
5300         }
5301     }
5302
5303     _mm_empty ();
5304 }
5305
5306 static void
5307 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5308                                  pixman_op_t              op,
5309                                  pixman_image_t *         src_image,
5310                                  pixman_image_t *         mask_image,
5311                                  pixman_image_t *         dst_image,
5312                                  int32_t                  src_x,
5313                                  int32_t                  src_y,
5314                                  int32_t                  mask_x,
5315                                  int32_t                  mask_y,
5316                                  int32_t                  dest_x,
5317                                  int32_t                  dest_y,
5318                                  int32_t                  width,
5319                                  int32_t                  height)
5320 {
5321     uint32_t    *src, *src_line, s;
5322     uint32_t    *dst, *dst_line, d;
5323     uint8_t         *mask, *mask_line;
5324     uint32_t m;
5325     int src_stride, mask_stride, dst_stride;
5326     int32_t w;
5327
5328     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5329     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5330     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5331
5332     PIXMAN_IMAGE_GET_LINE (
5333         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5334     PIXMAN_IMAGE_GET_LINE (
5335         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5336     PIXMAN_IMAGE_GET_LINE (
5337         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5338
5339     while (height--)
5340     {
5341         src = src_line;
5342         src_line += src_stride;
5343         dst = dst_line;
5344         dst_line += dst_stride;
5345         mask = mask_line;
5346         mask_line += mask_stride;
5347
5348         w = width;
5349
5350         while (w && (unsigned long)dst & 15)
5351         {
5352             uint32_t sa;
5353
5354             s = *src++;
5355             m = (uint32_t) *mask++;
5356             d = *dst;
5357
5358             sa = s >> 24;
5359
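            /* A 0xff mask with an opaque pixel copies straight through;
             * anything else goes through in_over with the mask and the
             * source alpha expanded.
             */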
5360             if (m)
5361             {
5362                 if (sa == 0xff && m == 0xff)
5363                 {
5364                     *dst = s;
5365                 }
5366                 else
5367                 {
5368                     __m64 ms, md, ma, msa;
5369
5370                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5371                     ms = unpack_32_1x64 (s);
5372                     md = unpack_32_1x64 (d);
5373
5374                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5375
5376                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5377                 }
5378             }
5379
5380             dst++;
5381             w--;
5382         }
5383
5384         while (w >= 4)
5385         {
5386             m = *(uint32_t *) mask;
5387
5388             if (m)
5389             {
5390                 xmm_src = load_128_unaligned ((__m128i*)src);
5391
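                /* Solid mask and fully opaque source pixels: OVER
                 * reduces to a copy; otherwise blend with in_over.
                 */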
5392                 if (m == 0xffffffff && is_opaque (xmm_src))
5393                 {
5394                     save_128_aligned ((__m128i *)dst, xmm_src);
5395                 }
5396                 else
5397                 {
5398                     xmm_dst = load_128_aligned ((__m128i *)dst);
5399
5400                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5401
5402                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5403                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5404                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5405
5406                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5407                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5408
5409                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5410                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5411
5412                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5413                 }
5414             }
5415
5416             src += 4;
5417             dst += 4;
5418             mask += 4;
5419             w -= 4;
5420         }
5421
5422         while (w)
5423         {
5424             uint32_t sa;
5425
5426             s = *src++;
5427             m = (uint32_t) *mask++;
5428             d = *dst;
5429
5430             sa = s >> 24;
5431
5432             if (m)
5433             {
5434                 if (sa == 0xff && m == 0xff)
5435                 {
5436                     *dst = s;
5437                 }
5438                 else
5439                 {
5440                     __m64 ms, md, ma, msa;
5441
5442                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5443                     ms = unpack_32_1x64 (s);
5444                     md = unpack_32_1x64 (d);
5445
5446                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5447
5448                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5449                 }
5450             }
5451
5452             dst++;
5453             w--;
5454         }
5455     }
5456
5457     _mm_empty ();
5458 }
5459
5460 static void
5461 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5462                                     pixman_op_t              op,
5463                                     pixman_image_t *         src_image,
5464                                     pixman_image_t *         mask_image,
5465                                     pixman_image_t *         dst_image,
5466                                     int32_t                  src_x,
5467                                     int32_t                  src_y,
5468                                     int32_t                  mask_x,
5469                                     int32_t                  mask_y,
5470                                     int32_t                  dest_x,
5471                                     int32_t                  dest_y,
5472                                     int32_t                  width,
5473                                     int32_t                  height)
5474 {
5475     uint32_t src;
5476     uint32_t    *dst_line, *dst;
5477     __m128i xmm_src;
5478     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5479     __m128i xmm_dsta_hi, xmm_dsta_lo;
5480     int dst_stride;
5481     int32_t w;
5482
5483     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5484
5485     if (src == 0)
5486         return;
5487
5488     PIXMAN_IMAGE_GET_LINE (
5489         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5490
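    /* OVER_REVERSE composites the destination over the solid source:
     * dst supplies the alpha and the expanded source acts as the
     * backdrop in the over_1x64 / over_2x128 calls below.
     */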
5491     xmm_src = expand_pixel_32_1x128 (src);
5492
5493     while (height--)
5494     {
5495         dst = dst_line;
5496
5497         dst_line += dst_stride;
5498         w = width;
5499
5500         while (w && (unsigned long)dst & 15)
5501         {
5502             __m64 vd;
5503
5504             vd = unpack_32_1x64 (*dst);
5505
5506             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5507                                             _mm_movepi64_pi64 (xmm_src)));
5508             w--;
5509             dst++;
5510         }
5511
5512         while (w >= 4)
5513         {
5514             __m128i tmp_lo, tmp_hi;
5515
5516             xmm_dst = load_128_aligned ((__m128i*)dst);
5517
5518             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5519             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5520
5521             tmp_lo = xmm_src;
5522             tmp_hi = xmm_src;
5523
5524             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5525                         &xmm_dsta_lo, &xmm_dsta_hi,
5526                         &tmp_lo, &tmp_hi);
5527
5528             save_128_aligned (
5529                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5530
5531             w -= 4;
5532             dst += 4;
5533         }
5534
5535         while (w)
5536         {
5537             __m64 vd;
5538
5539             vd = unpack_32_1x64 (*dst);
5540
5541             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5542                                             _mm_movepi64_pi64 (xmm_src)));
5543             w--;
5544             dst++;
5545         }
5546
5547     }
5548
5549     _mm_empty ();
5550 }
5551
5552 static void
5553 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5554                                     pixman_op_t              op,
5555                                     pixman_image_t *         src_image,
5556                                     pixman_image_t *         mask_image,
5557                                     pixman_image_t *         dst_image,
5558                                     int32_t                  src_x,
5559                                     int32_t                  src_y,
5560                                     int32_t                  mask_x,
5561                                     int32_t                  mask_y,
5562                                     int32_t                  dest_x,
5563                                     int32_t                  dest_y,
5564                                     int32_t                  width,
5565                                     int32_t                  height)
5566 {
5567     uint32_t    *src, *src_line, s;
5568     uint32_t    *dst, *dst_line, d;
5569     uint32_t    *mask, *mask_line;
5570     uint32_t    m;
5571     int src_stride, mask_stride, dst_stride;
5572     int32_t w;
5573
5574     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5575     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5576     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5577
5578     PIXMAN_IMAGE_GET_LINE (
5579         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5580     PIXMAN_IMAGE_GET_LINE (
5581         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5582     PIXMAN_IMAGE_GET_LINE (
5583         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5584
5585     while (height--)
5586     {
5587         src = src_line;
5588         src_line += src_stride;
5589         dst = dst_line;
5590         dst_line += dst_stride;
5591         mask = mask_line;
5592         mask_line += mask_stride;
5593
5594         w = width;
5595
5596         while (w && (unsigned long)dst & 15)
5597         {
5598             uint32_t sa;
5599
5600             s = *src++;
5601             m = (*mask++) >> 24;
5602             d = *dst;
5603
5604             sa = s >> 24;
5605
5606             if (m)
5607             {
5608                 if (sa == 0xff && m == 0xff)
5609                 {
5610                     *dst = s;
5611                 }
5612                 else
5613                 {
5614                     __m64 ms, md, ma, msa;
5615
5616                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5617                     ms = unpack_32_1x64 (s);
5618                     md = unpack_32_1x64 (d);
5619
5620                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5621
5622                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5623                 }
5624             }
5625
5626             dst++;
5627             w--;
5628         }
5629
5630         while (w >= 4)
5631         {
5632             xmm_mask = load_128_unaligned ((__m128i*)mask);
5633
5634             if (!is_transparent (xmm_mask))
5635             {
5636                 xmm_src = load_128_unaligned ((__m128i*)src);
5637
5638                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5639                 {
5640                     save_128_aligned ((__m128i *)dst, xmm_src);
5641                 }
5642                 else
5643                 {
5644                     xmm_dst = load_128_aligned ((__m128i *)dst);
5645
5646                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5647                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5648                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5649
5650                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5651                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5652
5653                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5654                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5655
5656                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5657                 }
5658             }
5659
5660             src += 4;
5661             dst += 4;
5662             mask += 4;
5663             w -= 4;
5664         }
5665
5666         while (w)
5667         {
5668             uint32_t sa;
5669
5670             s = *src++;
5671             m = (*mask++) >> 24;
5672             d = *dst;
5673
5674             sa = s >> 24;
5675
5676             if (m)
5677             {
5678                 if (sa == 0xff && m == 0xff)
5679                 {
5680                     *dst = s;
5681                 }
5682                 else
5683                 {
5684                     __m64 ms, md, ma, msa;
5685
5686                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5687                     ms = unpack_32_1x64 (s);
5688                     md = unpack_32_1x64 (d);
5689
5690                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5691
5692                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5693                 }
5694             }
5695
5696             dst++;
5697             w--;
5698         }
5699     }
5700
5701     _mm_empty ();
5702 }
5703
5704 /* A variant of 'core_combine_over_u_sse2' that steps through the source with a 16.16 fixed-point coordinate for nearest-neighbour scaling */
5705 static force_inline void
5706 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5707                                              const uint32_t* ps,
5708                                              int32_t         w,
5709                                              pixman_fixed_t  vx,
5710                                              pixman_fixed_t  unit_x,
5711                                              pixman_fixed_t  max_vx)
5712 {
5713     uint32_t s, d;
5714     const uint32_t* pm = NULL;
5715
5716     __m128i xmm_dst_lo, xmm_dst_hi;
5717     __m128i xmm_src_lo, xmm_src_hi;
5718     __m128i xmm_alpha_lo, xmm_alpha_hi;
5719
5720     /* Align dst on a 16-byte boundary */
5721     while (w && ((unsigned long)pd & 15))
5722     {
5723         d = *pd;
5724         s = combine1 (ps + (vx >> 16), pm);
5725         vx += unit_x;
5726
5727         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5728         if (pm)
5729             pm++;
5730         w--;
5731     }
5732
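    /* Gather four nearest-neighbour source pixels into one register;
     * vx advances by unit_x in 16.16 fixed point for each pixel, and
     * vx >> 16 selects the source pixel.
     */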
5733     while (w >= 4)
5734     {
5735         __m128i tmp;
5736         uint32_t tmp1, tmp2, tmp3, tmp4;
5737
5738         tmp1 = ps[vx >> 16];
5739         vx += unit_x;
5740         tmp2 = ps[vx >> 16];
5741         vx += unit_x;
5742         tmp3 = ps[vx >> 16];
5743         vx += unit_x;
5744         tmp4 = ps[vx >> 16];
5745         vx += unit_x;
5746
5747         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5748
5749         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5750
5751         if (is_opaque (xmm_src_hi))
5752         {
5753             save_128_aligned ((__m128i*)pd, xmm_src_hi);
5754         }
5755         else if (!is_zero (xmm_src_hi))
5756         {
5757             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5758
5759             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5760             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5761
5762             expand_alpha_2x128 (
5763                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5764
5765             over_2x128 (&xmm_src_lo, &xmm_src_hi,
5766                         &xmm_alpha_lo, &xmm_alpha_hi,
5767                         &xmm_dst_lo, &xmm_dst_hi);
5768
5769             /* rebuild the 4 pixel data and save */
5770             save_128_aligned ((__m128i*)pd,
5771                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5772         }
5773
5774         w -= 4;
5775         pd += 4;
5776         if (pm)
5777             pm += 4;
5778     }
5779
5780     while (w)
5781     {
5782         d = *pd;
5783         s = combine1 (ps + (vx >> 16), pm);
5784         vx += unit_x;
5785
5786         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5787         if (pm)
5788             pm++;
5789
5790         w--;
5791     }
5792     _mm_empty ();
5793 }
5794
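/* Instantiate the nearest-scaling main loop for the COVER, NONE and
 * PAD repeat modes; each macro expands to a full composite function
 * that calls the scanline helper above.
 */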
5795 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5796                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5797                        uint32_t, uint32_t, COVER)
5798 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5799                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5800                        uint32_t, uint32_t, NONE)
5801 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5802                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5803                        uint32_t, uint32_t, PAD)
5804
5805 static const pixman_fast_path_t sse2_fast_paths[] =
5806 {
5807     /* PIXMAN_OP_OVER */
5808     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5809     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5810     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5811     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5812     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5813     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5814     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5815     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5816     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5817     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5818     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5819     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5820     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5821     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5822     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5823     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5824     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5825     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5826     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5827     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5828     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5829     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5830     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5831     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5832     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5833     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5834     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5835     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5836     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5837     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5838     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5839     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5840     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5841     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5842     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5843     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5844     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5845     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5846     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5847     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5848     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5849     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5850     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5851     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5852     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5853     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5854
5855     /* PIXMAN_OP_OVER_REVERSE */
5856     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5857     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5858
5859     /* PIXMAN_OP_ADD */
5860     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5861     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5862     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5863     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5864     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5865     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5866
5867     /* PIXMAN_OP_SRC */
5868     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5869     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5870     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5871     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5872     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5873     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5874     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5875     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5876     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5877     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5878     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5879     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5880     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5881     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5882
5883     /* PIXMAN_OP_IN */
5884     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5885     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5886     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5887
5888     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5889     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5890     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5891     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5892     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5893     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5894     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5895     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5896     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5897     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5898     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5899     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5900
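    /* end-of-table sentinel */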
5901     { PIXMAN_OP_NONE },
5902 };
5903
5904 static pixman_bool_t
5905 sse2_blt (pixman_implementation_t *imp,
5906           uint32_t *               src_bits,
5907           uint32_t *               dst_bits,
5908           int                      src_stride,
5909           int                      dst_stride,
5910           int                      src_bpp,
5911           int                      dst_bpp,
5912           int                      src_x,
5913           int                      src_y,
5914           int                      dst_x,
5915           int                      dst_y,
5916           int                      width,
5917           int                      height)
5918 {
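    /* Try the SSE2 blt; bpp combinations it cannot handle fall
     * through to the delegate implementation.
     */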
5919     if (!pixman_blt_sse2 (
5920             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5921             src_x, src_y, dst_x, dst_y, width, height))
5923     {
5924         return _pixman_implementation_blt (
5925             imp->delegate,
5926             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5927             src_x, src_y, dst_x, dst_y, width, height);
5928     }
5929
5930     return TRUE;
5931 }
5932
5933 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5934 __attribute__((__force_align_arg_pointer__))
5935 #endif
5936 static pixman_bool_t
5937 sse2_fill (pixman_implementation_t *imp,
5938            uint32_t *               bits,
5939            int                      stride,
5940            int                      bpp,
5941            int                      x,
5942            int                      y,
5943            int                      width,
5944            int                      height,
5945            uint32_t xor)
5946 {
5947     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5948     {
5949         return _pixman_implementation_fill (
5950             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5951     }
5952
5953     return TRUE;
5954 }
5955
5956 static uint32_t *
5957 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
5958 {
5959     int w = iter->width;
5960     __m128i ff000000 = mask_ff000000;
5961     uint32_t *dst = iter->buffer;
5962     uint32_t *src = (uint32_t *)iter->bits;
5963
5964     iter->bits += iter->stride;
5965
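    /* Widen x8r8g8b8 to a8r8g8b8 by forcing the alpha byte to 0xff,
     * sixteen bytes at a time once dst is 16-byte aligned.
     */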
5966     while (w && ((unsigned long)dst) & 0x0f)
5967     {
5968         *dst++ = (*src++) | 0xff000000;
5969         w--;
5970     }
5971
5972     while (w >= 4)
5973     {
5974         save_128_aligned (
5975             (__m128i *)dst, _mm_or_si128 (
5976                 load_128_unaligned ((__m128i *)src), ff000000));
5977
5978         dst += 4;
5979         src += 4;
5980         w -= 4;
5981     }
5982
5983     while (w)
5984     {
5985         *dst++ = (*src++) | 0xff000000;
5986         w--;
5987     }
5988
5989     return iter->buffer;
5990 }
5991
5992 static uint32_t *
5993 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
5994 {
5995     int w = iter->width;
5996     uint32_t *dst = iter->buffer;
5997     uint8_t *src = iter->bits;
5998     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5999
6000     iter->bits += iter->stride;
6001
6002     while (w && (((unsigned long)dst) & 15))
6003     {
6004         *dst++ = *(src++) << 24;
6005         w--;
6006     }
6007
6008     while (w >= 16)
6009     {
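        /* Expand 16 a8 pixels to a8r8g8b8 (alpha << 24): interleaving
         * zeros below each byte twice shifts every alpha value into
         * the top byte of its own 32-bit lane.
         */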
6010         xmm0 = _mm_loadu_si128((__m128i *)src);
6011
6012         xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
6013         xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
6014         xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6015         xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6016         xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6017         xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6018
6019         _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
6020         _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
6021         _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
6022         _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6023
6024         dst += 16;
6025         src += 16;
6026         w -= 16;
6027     }
6028
6029     while (w)
6030     {
6031         *dst++ = *(src++) << 24;
6032         w--;
6033     }
6034
6035     return iter->buffer;
6036 }
6037
6038 typedef struct
6039 {
6040     pixman_format_code_t        format;
6041     pixman_iter_get_scanline_t  get_scanline;
6042 } fetcher_info_t;
6043
6044 static const fetcher_info_t fetchers[] =
6045 {
6046     { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
6047     { PIXMAN_a8,                sse2_fetch_a8 },
6048     { PIXMAN_null }
6049 };
6050
6051 static void
6052 sse2_src_iter_init (pixman_implementation_t *imp,
6053                     pixman_iter_t *iter,
6054                     pixman_image_t *image,
6055                     int x, int y, int width, int height,
6056                     uint8_t *buffer, iter_flags_t flags)
6057 {
6058 #define FLAGS                                                           \
6059     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
6060
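    /* Use the SSE2 fetchers only for narrow, untransformed images
     * whose requested scanlines lie fully inside the bits; everything
     * else falls through to the delegate.
     */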
6061     if ((flags & ITER_NARROW)                           &&
6062         (image->common.flags & FLAGS) == FLAGS          &&
6063         x >= 0 && y >= 0                                &&
6064         x + width <= image->bits.width                  &&
6065         y + height <= image->bits.height)
6066     {
6067         const fetcher_info_t *f;
6068
6069         for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
6070         {
6071             if (image->common.extended_format_code == f->format)
6072             {
6073                 uint8_t *b = (uint8_t *)image->bits.bits;
6074                 int s = image->bits.rowstride * 4;
6075
6076                 iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
6077                 iter->stride = s;
6078                 iter->width = width;
6079                 iter->buffer = (uint32_t *)buffer;
6080
6081                 iter->get_scanline = f->get_scanline;
6082                 return;
6083             }
6084         }
6085     }
6086
6087     _pixman_implementation_src_iter_init (
6088         imp->delegate, iter, image, x, y, width, height, buffer, flags);
6089 }
6090
6091 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6092 __attribute__((__force_align_arg_pointer__))
6093 #endif
6094 pixman_implementation_t *
6095 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6096 {
6097     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6098
6099     /* SSE2 constants */
6100     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6101     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6102     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6103     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6104     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6105     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6106     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6107     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6108     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
6109     mask_0080 = create_mask_16_128 (0x0080);
6110     mask_00ff = create_mask_16_128 (0x00ff);
6111     mask_0101 = create_mask_16_128 (0x0101);
6112     mask_ffff = create_mask_16_128 (0xffff);
6113     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6114     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6115
6116     /* MMX constants */
6117     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
6118     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
6119
6120     mask_x0080 = create_mask_16_64 (0x0080);
6121     mask_x00ff = create_mask_16_64 (0x00ff);
6122     mask_x0101 = create_mask_16_64 (0x0101);
6123     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
6124
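    /* The 64-bit masks above are built with MMX, so clear the MMX
     * state (EMMS) before returning to FPU code.
     */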
6125     _mm_empty ();
6126
6127     /* Set up function pointers */
6128
6129     /* SSE code patch for fbcompose.c */
6130     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6131     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6132     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6133     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6134     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6135     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6136     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6137     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6138     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6139     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6140
6141     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6142
6143     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6144     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6145     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6146     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6147     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6148     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6149     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6150     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6151     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6152     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6153     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6154
6155     imp->blt = sse2_blt;
6156     imp->fill = sse2_fill;
6157
6158     imp->src_iter_init = sse2_src_iter_init;
6159
6160     return imp;
6161 }
6162
6163 #endif /* USE_SSE2 */