Add SSE2 fetcher for 0565
pixman/pixman-sse2.c
/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
#   include "pixman-x64-mmx-emulation.h"
#endif

#ifdef USE_SSE2

/* --------------------------------------------------------------------
 * Locals
 */

static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_x_alpha;

static __m64 mask_x565_rgb;
static __m64 mask_x565_unpack;

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

/* ----------------------------------------------------------------------
 * SSE2 Inlines
 */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
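/* The or-with-shifted-copy steps above replicate each channel's top
 * bits into the low bits left empty by the widening shift, so a 5-bit
 * channel c becomes (c << 3) | (c >> 2) and a 6-bit green becomes
 * (g << 2) | (g >> 4); e.g. b = 0x1f expands to
 * (0x1f << 3) | (0x1f >> 2) = 0xf8 | 0x07 = 0xff.
 */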

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}
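/* For example, opaque red 0x00ff0000 packs to
 * ((0x00ff0000 >> 8) & 0xf800) = 0xf800, i.e. r = 0x1f, g = 0, b = 0.
 */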

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
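/* In the three predicates above, 0x8888 selects the movemask bits of
 * bytes 3, 7, 11 and 15 - the alpha bytes of four ARGB pixels - so
 * is_opaque and is_transparent test only alpha, while is_zero tests
 * all 16 bytes.
 */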

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
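/* The multiply-high by 0x0101 above is the usual divide-by-255 trick:
 * with t = x * a + 0x80, (t * 0x0101) >> 16 equals (t + (t >> 8)) >> 8,
 * which is x * a / 255 correctly rounded for all 8-bit x and a.
 */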

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
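/* over_2x128 computes the premultiplied OVER operator on eight
 * unpacked channels at once:
 *
 *     dst = src + dst * (255 - src.alpha) / 255
 *
 * with a saturating add.
 */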

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
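/* in_over_2x128 is the fused (src IN mask) OVER dst operation used by
 * the component-alpha paths: both the source and its alpha are first
 * multiplied by the mask before compositing over the destination.
 */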

/* load 4 pixels from a 16-byte-aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte-aligned
 * address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte-aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

/* ------------------------------------------------------------------
 * MMX inlines
 */

static force_inline __m64
load_32_1x64 (uint32_t data)
{
    return _mm_cvtsi32_si64 (data);
}

static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
}

static force_inline __m64
expand_alpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
expand_pixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (
        unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
pix_multiply_1x64 (__m64 data,
                   __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          mask_x0080),
                           mask_x0101);
}

static force_inline __m64
pix_add_multiply_1x64 (__m64* src,
                       __m64* alpha_dst,
                       __m64* dst,
                       __m64* alpha_src)
{
    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);

    return _mm_adds_pu8 (t1, t2);
}

static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, mask_x00ff);
}

static force_inline __m64
invert_colors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
}

static force_inline __m64
in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pix_multiply_1x64 (*src, *mask),
                      pix_multiply_1x64 (*alpha, *mask),
                      *dst);
}

static force_inline __m64
over_rev_non_pre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expand_alpha_1x64 (src);

    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
                                         _mm_or_si64 (alpha, mask_x_alpha)),
                      alpha,
                      dst);
}

static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}

/* Expand 16 bits positioned at @pos (0-3) of an MMX register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, mask_x565_rgb);
    p = _mm_mullo_pi16 (p, mask_x565_unpack);

    return _mm_srli_pi16 (p, 8);
}
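/* Worked example for pure red, pixel = 0xf800 (r = 0x1f): after the
 * shifts and the mask, red sits in the third 16-bit word as 0x01f0;
 * 0x01f0 * 0x0084 = 0xffc0 and 0xffc0 >> 8 = 0xff, which is the same
 * as replicating the top bits, (0x1f << 3) | (0x1f >> 2).
 */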

/* ----------------------------------------------------------------------------
 * Compose Core transformations
 */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m64 ms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        ms = unpack_32_1x64 (src);
        return pack_1x64_32 (
            over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
    }

    return dst;
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m64 ms, mm;

        mm = unpack_32_1x64 (*pm);
        mm = expand_alpha_1x64 (mm);

        ms = unpack_32_1x64 (s);
        ms = pix_multiply_1x64 (ms, mm);

        s = pack_1x64_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
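/* combine1 and combine4 implement the "unified" mask step for one and
 * four pixels respectively: when a mask pointer is given, the source
 * is multiplied by the mask's expanded alpha before it reaches the
 * combiner; combine4 also short-circuits fully transparent masks.
 */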

static force_inline void
core_combine_over_u_sse2_mask (uint32_t *         pd,
                               const uint32_t*    ps,
                               const uint32_t*    pm,
                               int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t*    ps,
                                  int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static force_inline void
core_combine_over_reverse_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);
        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (dst),
                               expand_alpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}

static force_inline void
core_combine_in_u_sse2 (uint32_t*       pd,
                        const uint32_t* ps,
                        const uint32_t* pm,
                        int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_reverse_in_u_sse2 (uint32_t*       pd,
                                const uint32_t* ps,
                                const uint32_t *pm,
                                int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_reverse_out_u_sse2 (uint32_t*       pd,
                                 const uint32_t* ps,
                                 const uint32_t* pm,
                                 int             w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static force_inline void
core_combine_out_u_sse2 (uint32_t*       pd,
                         const uint32_t* ps,
                         const uint32_t* pm,
                         int             w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (s), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (s), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
    __m64 da = expand_alpha_1x64 (d);

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
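/* ATOP: result = (src * dst.alpha + dst * (255 - src.alpha)) / 255.
 * The destination alpha and the negated source alpha are handed to
 * pix_add_multiply_1x64, which does both products and the saturating
 * add.
 */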

static force_inline void
core_combine_atop_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}

static force_inline void
core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
}
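/* XOR: result = (src * (255 - dst.alpha) + dst * (255 - src.alpha)) / 255,
 * i.e. each operand survives only where the other is transparent.
 */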

static force_inline void
core_combine_xor_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t *mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_add_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t* mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x64 (
            ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}
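/* SATURATE adds as much of the source as fits under the destination's
 * remaining alpha headroom: with da = 255 - dst.alpha, if src.alpha > da
 * the source is first scaled by da / src.alpha, then the channels are
 * added with 16-bit saturation.
 */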

static force_inline void
core_combine_saturate_u_sse2 (uint32_t *      pd,
                              const uint32_t *ps,
                              const uint32_t *pm,
                              int             w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned  ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some alpha src is greater than respective ~alpha dst */
1476         if (pack_cmp)
1477         {
1478             s = combine1 (ps++, pm);
1479             d = *pd;
1480             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1481             if (pm)
1482                 pm++;
1483
1484             s = combine1 (ps++, pm);
1485             d = *pd;
1486             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1487             if (pm)
1488                 pm++;
1489
1490             s = combine1 (ps++, pm);
1491             d = *pd;
1492             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1493             if (pm)
1494                 pm++;
1495
1496             s = combine1 (ps++, pm);
1497             d = *pd;
1498             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1499             if (pm)
1500                 pm++;
1501         }
1502         else
1503         {
1504             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1505
1506             pd += 4;
1507             ps += 4;
1508             if (pm)
1509                 pm += 4;
1510         }
1511
1512         w -= 4;
1513     }
1514
1515     while (w--)
1516     {
1517         s = combine1 (ps, pm);
1518         d = *pd;
1519
1520         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1521         ps++;
1522         if (pm)
1523             pm++;
1524     }
1525 }
1526
1527 static force_inline void
1528 core_combine_src_ca_sse2 (uint32_t*       pd,
1529                           const uint32_t* ps,
1530                           const uint32_t *pm,
1531                           int             w)
1532 {
1533     uint32_t s, m;
1534
1535     __m128i xmm_src_lo, xmm_src_hi;
1536     __m128i xmm_mask_lo, xmm_mask_hi;
1537     __m128i xmm_dst_lo, xmm_dst_hi;
1538
1539     while (w && (unsigned long)pd & 15)
1540     {
1541         s = *ps++;
1542         m = *pm++;
1543         *pd++ = pack_1x64_32 (
1544             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1545         w--;
1546     }
1547
1548     while (w >= 4)
1549     {
1550         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1551         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1552
1553         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1554         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1555
1556         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1557                             &xmm_mask_lo, &xmm_mask_hi,
1558                             &xmm_dst_lo, &xmm_dst_hi);
1559
1560         save_128_aligned (
1561             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1562
1563         ps += 4;
1564         pd += 4;
1565         pm += 4;
1566         w -= 4;
1567     }
1568
1569     while (w)
1570     {
1571         s = *ps++;
1572         m = *pm++;
1573         *pd++ = pack_1x64_32 (
1574             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1575         w--;
1576     }
1577 }
1578
1579 static force_inline uint32_t
1580 core_combine_over_ca_pixel_sse2 (uint32_t src,
1581                                  uint32_t mask,
1582                                  uint32_t dst)
1583 {
1584     __m64 s = unpack_32_1x64 (src);
1585     __m64 expAlpha = expand_alpha_1x64 (s);
1586     __m64 unpk_mask = unpack_32_1x64 (mask);
1587     __m64 unpk_dst  = unpack_32_1x64 (dst);
1588
1589     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1590 }
1591
1592 static force_inline void
1593 core_combine_over_ca_sse2 (uint32_t*       pd,
1594                            const uint32_t* ps,
1595                            const uint32_t *pm,
1596                            int             w)
1597 {
1598     uint32_t s, m, d;
1599
1600     __m128i xmm_alpha_lo, xmm_alpha_hi;
1601     __m128i xmm_src_lo, xmm_src_hi;
1602     __m128i xmm_dst_lo, xmm_dst_hi;
1603     __m128i xmm_mask_lo, xmm_mask_hi;
1604
1605     while (w && (unsigned long)pd & 15)
1606     {
1607         s = *ps++;
1608         m = *pm++;
1609         d = *pd;
1610
1611         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1612         w--;
1613     }
1614
1615     while (w >= 4)
1616     {
1617         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1618         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1619         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1620
1621         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1622         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1623         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1624
1625         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1626                             &xmm_alpha_lo, &xmm_alpha_hi);
1627
1628         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1629                        &xmm_alpha_lo, &xmm_alpha_hi,
1630                        &xmm_mask_lo, &xmm_mask_hi,
1631                        &xmm_dst_lo, &xmm_dst_hi);
1632
1633         save_128_aligned (
1634             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1635
1636         ps += 4;
1637         pd += 4;
1638         pm += 4;
1639         w -= 4;
1640     }
1641
1642     while (w)
1643     {
1644         s = *ps++;
1645         m = *pm++;
1646         d = *pd;
1647
1648         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1649         w--;
1650     }
1651 }
1652
1653 static force_inline uint32_t
1654 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1655                                          uint32_t mask,
1656                                          uint32_t dst)
1657 {
1658     __m64 d = unpack_32_1x64 (dst);
1659
1660     return pack_1x64_32 (
1661         over_1x64 (d, expand_alpha_1x64 (d),
1662                    pix_multiply_1x64 (unpack_32_1x64 (src),
1663                                       unpack_32_1x64 (mask))));
1664 }
1665
1666 static force_inline void
1667 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1668                                    const uint32_t* ps,
1669                                    const uint32_t *pm,
1670                                    int             w)
1671 {
1672     uint32_t s, m, d;
1673
1674     __m128i xmm_alpha_lo, xmm_alpha_hi;
1675     __m128i xmm_src_lo, xmm_src_hi;
1676     __m128i xmm_dst_lo, xmm_dst_hi;
1677     __m128i xmm_mask_lo, xmm_mask_hi;
1678
1679     while (w && (unsigned long)pd & 15)
1680     {
1681         s = *ps++;
1682         m = *pm++;
1683         d = *pd;
1684
1685         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1686         w--;
1687     }
1688
1689     while (w >= 4)
1690     {
1691         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1692         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1693         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1694
1695         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1696         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1697         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1698
1699         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1700                             &xmm_alpha_lo, &xmm_alpha_hi);
1701         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1702                             &xmm_mask_lo, &xmm_mask_hi,
1703                             &xmm_mask_lo, &xmm_mask_hi);
1704
1705         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1706                     &xmm_alpha_lo, &xmm_alpha_hi,
1707                     &xmm_mask_lo, &xmm_mask_hi);
1708
1709         save_128_aligned (
1710             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1711
1712         ps += 4;
1713         pd += 4;
1714         pm += 4;
1715         w -= 4;
1716     }
1717
1718     while (w)
1719     {
1720         s = *ps++;
1721         m = *pm++;
1722         d = *pd;
1723
1724         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1725         w--;
1726     }
1727 }
1728
1729 static force_inline void
1730 core_combine_in_ca_sse2 (uint32_t *      pd,
1731                          const uint32_t *ps,
1732                          const uint32_t *pm,
1733                          int             w)
1734 {
1735     uint32_t s, m, d;
1736
1737     __m128i xmm_alpha_lo, xmm_alpha_hi;
1738     __m128i xmm_src_lo, xmm_src_hi;
1739     __m128i xmm_dst_lo, xmm_dst_hi;
1740     __m128i xmm_mask_lo, xmm_mask_hi;
1741
1742     while (w && (unsigned long)pd & 15)
1743     {
1744         s = *ps++;
1745         m = *pm++;
1746         d = *pd;
1747
1748         *pd++ = pack_1x64_32 (
1749             pix_multiply_1x64 (
1750                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1751                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1752
1753         w--;
1754     }
1755
1756     while (w >= 4)
1757     {
1758         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1759         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1760         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1761
1762         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1763         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1764         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1765
1766         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1767                             &xmm_alpha_lo, &xmm_alpha_hi);
1768
1769         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1770                             &xmm_mask_lo, &xmm_mask_hi,
1771                             &xmm_dst_lo, &xmm_dst_hi);
1772
1773         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1774                             &xmm_alpha_lo, &xmm_alpha_hi,
1775                             &xmm_dst_lo, &xmm_dst_hi);
1776
1777         save_128_aligned (
1778             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1779
1780         ps += 4;
1781         pd += 4;
1782         pm += 4;
1783         w -= 4;
1784     }
1785
1786     while (w)
1787     {
1788         s = *ps++;
1789         m = *pm++;
1790         d = *pd;
1791
1792         *pd++ = pack_1x64_32 (
1793             pix_multiply_1x64 (
1794                 pix_multiply_1x64 (
1795                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1796                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1797
1798         w--;
1799     }
1800 }
1801
1802 static force_inline void
1803 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1804                                  const uint32_t *ps,
1805                                  const uint32_t *pm,
1806                                  int             w)
1807 {
1808     uint32_t s, m, d;
1809
1810     __m128i xmm_alpha_lo, xmm_alpha_hi;
1811     __m128i xmm_src_lo, xmm_src_hi;
1812     __m128i xmm_dst_lo, xmm_dst_hi;
1813     __m128i xmm_mask_lo, xmm_mask_hi;
1814
1815     while (w && (unsigned long)pd & 15)
1816     {
1817         s = *ps++;
1818         m = *pm++;
1819         d = *pd;
1820
1821         *pd++ = pack_1x64_32 (
1822             pix_multiply_1x64 (
1823                 unpack_32_1x64 (d),
1824                 pix_multiply_1x64 (unpack_32_1x64 (m),
1825                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1826         w--;
1827     }
1828
1829     while (w >= 4)
1830     {
1831         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1832         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1833         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1834
1835         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1836         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1837         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1838
1839         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1840                             &xmm_alpha_lo, &xmm_alpha_hi);
1841         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1842                             &xmm_alpha_lo, &xmm_alpha_hi,
1843                             &xmm_alpha_lo, &xmm_alpha_hi);
1844
1845         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1846                             &xmm_alpha_lo, &xmm_alpha_hi,
1847                             &xmm_dst_lo, &xmm_dst_hi);
1848
1849         save_128_aligned (
1850             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1851
1852         ps += 4;
1853         pd += 4;
1854         pm += 4;
1855         w -= 4;
1856     }
1857
1858     while (w)
1859     {
1860         s = *ps++;
1861         m = *pm++;
1862         d = *pd;
1863
1864         *pd++ = pack_1x64_32 (
1865             pix_multiply_1x64 (
1866                 unpack_32_1x64 (d),
1867                 pix_multiply_1x64 (unpack_32_1x64 (m),
1868                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1869         w--;
1870     }
1871 }
1872
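/* Component-alpha OUT.  Per channel:
 *
 *     dest = (src * mask) * (1 - dest.a)
 */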
1873 static force_inline void
1874 core_combine_out_ca_sse2 (uint32_t *      pd,
1875                           const uint32_t *ps,
1876                           const uint32_t *pm,
1877                           int             w)
1878 {
1879     uint32_t s, m, d;
1880
1881     __m128i xmm_alpha_lo, xmm_alpha_hi;
1882     __m128i xmm_src_lo, xmm_src_hi;
1883     __m128i xmm_dst_lo, xmm_dst_hi;
1884     __m128i xmm_mask_lo, xmm_mask_hi;
1885
1886     while (w && (unsigned long)pd & 15)
1887     {
1888         s = *ps++;
1889         m = *pm++;
1890         d = *pd;
1891
1892         *pd++ = pack_1x64_32 (
1893             pix_multiply_1x64 (
1894                 pix_multiply_1x64 (
1895                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1896                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
1897         w--;
1898     }
1899
1900     while (w >= 4)
1901     {
1902         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1903         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1904         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1905
1906         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1907         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1908         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1909
1910         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1911                             &xmm_alpha_lo, &xmm_alpha_hi);
1912         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1913                       &xmm_alpha_lo, &xmm_alpha_hi);
1914
1915         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1916                             &xmm_mask_lo, &xmm_mask_hi,
1917                             &xmm_dst_lo, &xmm_dst_hi);
1918         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1919                             &xmm_alpha_lo, &xmm_alpha_hi,
1920                             &xmm_dst_lo, &xmm_dst_hi);
1921
1922         save_128_aligned (
1923             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1924
1925         ps += 4;
1926         pd += 4;
1927         pm += 4;
1928         w -= 4;
1929     }
1930
1931     while (w)
1932     {
1933         s = *ps++;
1934         m = *pm++;
1935         d = *pd;
1936
1937         *pd++ = pack_1x64_32 (
1938             pix_multiply_1x64 (
1939                 pix_multiply_1x64 (
1940                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1941                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
1942
1943         w--;
1944     }
1945 }
1946
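/* Component-alpha OUT_REVERSE.  Per channel:
 *
 *     dest = dest * (1 - mask * src.a)
 */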
1947 static force_inline void
1948 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
1949                                   const uint32_t *ps,
1950                                   const uint32_t *pm,
1951                                   int             w)
1952 {
1953     uint32_t s, m, d;
1954
1955     __m128i xmm_alpha_lo, xmm_alpha_hi;
1956     __m128i xmm_src_lo, xmm_src_hi;
1957     __m128i xmm_dst_lo, xmm_dst_hi;
1958     __m128i xmm_mask_lo, xmm_mask_hi;
1959
1960     while (w && (unsigned long)pd & 15)
1961     {
1962         s = *ps++;
1963         m = *pm++;
1964         d = *pd;
1965
1966         *pd++ = pack_1x64_32 (
1967             pix_multiply_1x64 (
1968                 unpack_32_1x64 (d),
1969                 negate_1x64 (pix_multiply_1x64 (
1970                                  unpack_32_1x64 (m),
1971                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
1972         w--;
1973     }
1974
1975     while (w >= 4)
1976     {
1977         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1978         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1979         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1980
1981         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1982         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1983         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1984
1985         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1986                             &xmm_alpha_lo, &xmm_alpha_hi);
1987
1988         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1989                             &xmm_alpha_lo, &xmm_alpha_hi,
1990                             &xmm_mask_lo, &xmm_mask_hi);
1991
1992         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1993                       &xmm_mask_lo, &xmm_mask_hi);
1994
1995         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1996                             &xmm_mask_lo, &xmm_mask_hi,
1997                             &xmm_dst_lo, &xmm_dst_hi);
1998
1999         save_128_aligned (
2000             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2001
2002         ps += 4;
2003         pd += 4;
2004         pm += 4;
2005         w -= 4;
2006     }
2007
2008     while (w)
2009     {
2010         s = *ps++;
2011         m = *pm++;
2012         d = *pd;
2013
2014         *pd++ = pack_1x64_32 (
2015             pix_multiply_1x64 (
2016                 unpack_32_1x64 (d),
2017                 negate_1x64 (pix_multiply_1x64 (
2018                                  unpack_32_1x64 (m),
2019                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2020         w--;
2021     }
2022 }
2023
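/* Component-alpha ATOP, one pixel.  Per channel this evaluates
 *
 *     dest = (src * mask) * dest.a + dest * (1 - mask * src.a)
 *
 * with both products folded into a single pix_add_multiply_1x64 call.
 */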
2024 static force_inline uint32_t
2025 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2026                                  uint32_t mask,
2027                                  uint32_t dst)
2028 {
2029     __m64 m = unpack_32_1x64 (mask);
2030     __m64 s = unpack_32_1x64 (src);
2031     __m64 d = unpack_32_1x64 (dst);
2032     __m64 sa = expand_alpha_1x64 (s);
2033     __m64 da = expand_alpha_1x64 (d);
2034
2035     s = pix_multiply_1x64 (s, m);
2036     m = negate_1x64 (pix_multiply_1x64 (m, sa));
2037
2038     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2039 }
2040
2041 static force_inline void
2042 core_combine_atop_ca_sse2 (uint32_t *      pd,
2043                            const uint32_t *ps,
2044                            const uint32_t *pm,
2045                            int             w)
2046 {
2047     uint32_t s, m, d;
2048
2049     __m128i xmm_src_lo, xmm_src_hi;
2050     __m128i xmm_dst_lo, xmm_dst_hi;
2051     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2052     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2053     __m128i xmm_mask_lo, xmm_mask_hi;
2054
2055     while (w && (unsigned long)pd & 15)
2056     {
2057         s = *ps++;
2058         m = *pm++;
2059         d = *pd;
2060
2061         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2062         w--;
2063     }
2064
2065     while (w >= 4)
2066     {
2067         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2068         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2069         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070
2071         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2072         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2073         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074
2075         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2076                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2077         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2078                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2079
2080         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2081                             &xmm_mask_lo, &xmm_mask_hi,
2082                             &xmm_src_lo, &xmm_src_hi);
2083         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2084                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2085                             &xmm_mask_lo, &xmm_mask_hi);
2086
2087         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2088
2089         pix_add_multiply_2x128 (
2090             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092             &xmm_dst_lo, &xmm_dst_hi);
2093
2094         save_128_aligned (
2095             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096
2097         ps += 4;
2098         pd += 4;
2099         pm += 4;
2100         w -= 4;
2101     }
2102
2103     while (w)
2104     {
2105         s = *ps++;
2106         m = *pm++;
2107         d = *pd;
2108
2109         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2110         w--;
2111     }
2112 }
2113
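/* Component-alpha ATOP_REVERSE, one pixel:
 *
 *     dest = (src * mask) * (1 - dest.a) + dest * (mask * src.a)
 */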
2114 static force_inline uint32_t
2115 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2116                                          uint32_t mask,
2117                                          uint32_t dst)
2118 {
2119     __m64 m = unpack_32_1x64 (mask);
2120     __m64 s = unpack_32_1x64 (src);
2121     __m64 d = unpack_32_1x64 (dst);
2122
2123     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2124     __m64 sa = expand_alpha_1x64 (s);
2125
2126     s = pix_multiply_1x64 (s, m);
2127     m = pix_multiply_1x64 (m, sa);
2128
2129     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2130 }
2131
2132 static force_inline void
2133 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2134                                    const uint32_t *ps,
2135                                    const uint32_t *pm,
2136                                    int             w)
2137 {
2138     uint32_t s, m, d;
2139
2140     __m128i xmm_src_lo, xmm_src_hi;
2141     __m128i xmm_dst_lo, xmm_dst_hi;
2142     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2143     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2144     __m128i xmm_mask_lo, xmm_mask_hi;
2145
2146     while (w && (unsigned long)pd & 15)
2147     {
2148         s = *ps++;
2149         m = *pm++;
2150         d = *pd;
2151
2152         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2153         w--;
2154     }
2155
2156     while (w >= 4)
2157     {
2158         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2159         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2160         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2161
2162         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2163         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2164         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2165
2166         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2167                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2168         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2169                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2170
2171         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2172                             &xmm_mask_lo, &xmm_mask_hi,
2173                             &xmm_src_lo, &xmm_src_hi);
2174         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2175                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2176                             &xmm_mask_lo, &xmm_mask_hi);
2177
2178         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2179                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2180
2181         pix_add_multiply_2x128 (
2182             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2183             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2184             &xmm_dst_lo, &xmm_dst_hi);
2185
2186         save_128_aligned (
2187             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2188
2189         ps += 4;
2190         pd += 4;
2191         pm += 4;
2192         w -= 4;
2193     }
2194
2195     while (w)
2196     {
2197         s = *ps++;
2198         m = *pm++;
2199         d = *pd;
2200
2201         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2202         w--;
2203     }
2204 }
2205
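/* Component-alpha XOR, one pixel:
 *
 *     dest = (src * mask) * (1 - dest.a) + dest * (1 - mask * src.a)
 */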
2206 static force_inline uint32_t
2207 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2208                                 uint32_t mask,
2209                                 uint32_t dst)
2210 {
2211     __m64 a = unpack_32_1x64 (mask);
2212     __m64 s = unpack_32_1x64 (src);
2213     __m64 d = unpack_32_1x64 (dst);
2214
2215     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2216                                        a, expand_alpha_1x64 (s)));
2217     __m64 dest      = pix_multiply_1x64 (s, a);
2218     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2219
2220     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2221                                                 &alpha_dst,
2222                                                 &dest,
2223                                                 &alpha_src));
2224 }
2225
2226 static force_inline void
2227 core_combine_xor_ca_sse2 (uint32_t *      pd,
2228                           const uint32_t *ps,
2229                           const uint32_t *pm,
2230                           int             w)
2231 {
2232     uint32_t s, m, d;
2233
2234     __m128i xmm_src_lo, xmm_src_hi;
2235     __m128i xmm_dst_lo, xmm_dst_hi;
2236     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2237     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2238     __m128i xmm_mask_lo, xmm_mask_hi;
2239
2240     while (w && (unsigned long)pd & 15)
2241     {
2242         s = *ps++;
2243         m = *pm++;
2244         d = *pd;
2245
2246         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2247         w--;
2248     }
2249
2250     while (w >= 4)
2251     {
2252         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2253         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2254         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2255
2256         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2257         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2258         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2259
2260         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2261                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2262         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2263                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2264
2265         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2266                             &xmm_mask_lo, &xmm_mask_hi,
2267                             &xmm_src_lo, &xmm_src_hi);
2268         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2269                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2270                             &xmm_mask_lo, &xmm_mask_hi);
2271
2272         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2273                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2274         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2275                       &xmm_mask_lo, &xmm_mask_hi);
2276
2277         pix_add_multiply_2x128 (
2278             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2279             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2280             &xmm_dst_lo, &xmm_dst_hi);
2281
2282         save_128_aligned (
2283             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2284
2285         ps += 4;
2286         pd += 4;
2287         pm += 4;
2288         w -= 4;
2289     }
2290
2291     while (w)
2292     {
2293         s = *ps++;
2294         m = *pm++;
2295         d = *pd;
2296
2297         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2298         w--;
2299     }
2300 }
2301
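/* Component-alpha ADD:
 *
 *     dest = clamp (src * mask + dest)
 *
 * The saturating byte adds (_mm_adds_epu8 / _mm_adds_pu8) provide the
 * clamp for free.
 */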
2302 static force_inline void
2303 core_combine_add_ca_sse2 (uint32_t *      pd,
2304                           const uint32_t *ps,
2305                           const uint32_t *pm,
2306                           int             w)
2307 {
2308     uint32_t s, m, d;
2309
2310     __m128i xmm_src_lo, xmm_src_hi;
2311     __m128i xmm_dst_lo, xmm_dst_hi;
2312     __m128i xmm_mask_lo, xmm_mask_hi;
2313
2314     while (w && (unsigned long)pd & 15)
2315     {
2316         s = *ps++;
2317         m = *pm++;
2318         d = *pd;
2319
2320         *pd++ = pack_1x64_32 (
2321             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2322                                              unpack_32_1x64 (m)),
2323                           unpack_32_1x64 (d)));
2324         w--;
2325     }
2326
2327     while (w >= 4)
2328     {
2329         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2330         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2331         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2332
2333         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2334         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2335         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2336
2337         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2338                             &xmm_mask_lo, &xmm_mask_hi,
2339                             &xmm_src_lo, &xmm_src_hi);
2340
2341         save_128_aligned (
2342             (__m128i*)pd, pack_2x128_128 (
2343                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2344                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2345
2346         ps += 4;
2347         pd += 4;
2348         pm += 4;
2349         w -= 4;
2350     }
2351
2352     while (w)
2353     {
2354         s = *ps++;
2355         m = *pm++;
2356         d = *pd;
2357
2358         *pd++ = pack_1x64_32 (
2359             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2360                                              unpack_32_1x64 (m)),
2361                           unpack_32_1x64 (d)));
2362         w--;
2363     }
2364 }
2365
2366 /* ---------------------------------------------------
2367  * fb_compose_setup_SSE2
2368  */
2369 static force_inline __m64
2370 create_mask_16_64 (uint16_t mask)
2371 {
2372     return _mm_set1_pi16 (mask);
2373 }
2374
2375 static force_inline __m128i
2376 create_mask_16_128 (uint16_t mask)
2377 {
2378     return _mm_set1_epi16 (mask);
2379 }
2380
2381 static force_inline __m64
2382 create_mask_2x32_64 (uint32_t mask0,
2383                      uint32_t mask1)
2384 {
2385     return _mm_set_pi32 (mask0, mask1);
2386 }
2387
2388 /* Work around a code generation bug in Sun Studio 12. */
2389 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2390 # define create_mask_2x32_128(mask0, mask1)                             \
2391     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2392 #else
2393 static force_inline __m128i
2394 create_mask_2x32_128 (uint32_t mask0,
2395                       uint32_t mask1)
2396 {
2397     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2398 }
2399 #endif
2400
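/* These helpers exist so the static SIMD constants declared at the top
 * of this file can be built at implementation setup time.  A sketch of
 * how they would be used (the actual setup code lives elsewhere in
 * this file; the values shown here are illustrative):
 *
 *     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
 *     mask_00ff     = create_mask_16_128 (0x00ff);
 */
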
2401 /* SSE2 code patch for fbcompose.c */
2402
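/* Each wrapper below forwards to its core_combine_* routine and then
 * issues _mm_empty ().  The single-pixel 1x64 paths use MMX registers,
 * which alias the x87 floating point stack, so EMMS must run before
 * control returns to code that might use the FPU.
 */
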
2403 static void
2404 sse2_combine_over_u (pixman_implementation_t *imp,
2405                      pixman_op_t              op,
2406                      uint32_t *               dst,
2407                      const uint32_t *         src,
2408                      const uint32_t *         mask,
2409                      int                      width)
2410 {
2411     core_combine_over_u_sse2 (dst, src, mask, width);
2412     _mm_empty ();
2413 }
2414
2415 static void
2416 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2417                              pixman_op_t              op,
2418                              uint32_t *               dst,
2419                              const uint32_t *         src,
2420                              const uint32_t *         mask,
2421                              int                      width)
2422 {
2423     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2424     _mm_empty ();
2425 }
2426
2427 static void
2428 sse2_combine_in_u (pixman_implementation_t *imp,
2429                    pixman_op_t              op,
2430                    uint32_t *               dst,
2431                    const uint32_t *         src,
2432                    const uint32_t *         mask,
2433                    int                      width)
2434 {
2435     core_combine_in_u_sse2 (dst, src, mask, width);
2436     _mm_empty ();
2437 }
2438
2439 static void
2440 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2441                            pixman_op_t              op,
2442                            uint32_t *               dst,
2443                            const uint32_t *         src,
2444                            const uint32_t *         mask,
2445                            int                      width)
2446 {
2447     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2448     _mm_empty ();
2449 }
2450
2451 static void
2452 sse2_combine_out_u (pixman_implementation_t *imp,
2453                     pixman_op_t              op,
2454                     uint32_t *               dst,
2455                     const uint32_t *         src,
2456                     const uint32_t *         mask,
2457                     int                      width)
2458 {
2459     core_combine_out_u_sse2 (dst, src, mask, width);
2460     _mm_empty ();
2461 }
2462
2463 static void
2464 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2465                             pixman_op_t              op,
2466                             uint32_t *               dst,
2467                             const uint32_t *         src,
2468                             const uint32_t *         mask,
2469                             int                      width)
2470 {
2471     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2472     _mm_empty ();
2473 }
2474
2475 static void
2476 sse2_combine_atop_u (pixman_implementation_t *imp,
2477                      pixman_op_t              op,
2478                      uint32_t *               dst,
2479                      const uint32_t *         src,
2480                      const uint32_t *         mask,
2481                      int                      width)
2482 {
2483     core_combine_atop_u_sse2 (dst, src, mask, width);
2484     _mm_empty ();
2485 }
2486
2487 static void
2488 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2489                              pixman_op_t              op,
2490                              uint32_t *               dst,
2491                              const uint32_t *         src,
2492                              const uint32_t *         mask,
2493                              int                      width)
2494 {
2495     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2496     _mm_empty ();
2497 }
2498
2499 static void
2500 sse2_combine_xor_u (pixman_implementation_t *imp,
2501                     pixman_op_t              op,
2502                     uint32_t *               dst,
2503                     const uint32_t *         src,
2504                     const uint32_t *         mask,
2505                     int                      width)
2506 {
2507     core_combine_xor_u_sse2 (dst, src, mask, width);
2508     _mm_empty ();
2509 }
2510
2511 static void
2512 sse2_combine_add_u (pixman_implementation_t *imp,
2513                     pixman_op_t              op,
2514                     uint32_t *               dst,
2515                     const uint32_t *         src,
2516                     const uint32_t *         mask,
2517                     int                      width)
2518 {
2519     core_combine_add_u_sse2 (dst, src, mask, width);
2520     _mm_empty ();
2521 }
2522
2523 static void
2524 sse2_combine_saturate_u (pixman_implementation_t *imp,
2525                          pixman_op_t              op,
2526                          uint32_t *               dst,
2527                          const uint32_t *         src,
2528                          const uint32_t *         mask,
2529                          int                      width)
2530 {
2531     core_combine_saturate_u_sse2 (dst, src, mask, width);
2532     _mm_empty ();
2533 }
2534
2535 static void
2536 sse2_combine_src_ca (pixman_implementation_t *imp,
2537                      pixman_op_t              op,
2538                      uint32_t *               dst,
2539                      const uint32_t *         src,
2540                      const uint32_t *         mask,
2541                      int                      width)
2542 {
2543     core_combine_src_ca_sse2 (dst, src, mask, width);
2544     _mm_empty ();
2545 }
2546
2547 static void
2548 sse2_combine_over_ca (pixman_implementation_t *imp,
2549                       pixman_op_t              op,
2550                       uint32_t *               dst,
2551                       const uint32_t *         src,
2552                       const uint32_t *         mask,
2553                       int                      width)
2554 {
2555     core_combine_over_ca_sse2 (dst, src, mask, width);
2556     _mm_empty ();
2557 }
2558
2559 static void
2560 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2561                               pixman_op_t              op,
2562                               uint32_t *               dst,
2563                               const uint32_t *         src,
2564                               const uint32_t *         mask,
2565                               int                      width)
2566 {
2567     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2568     _mm_empty ();
2569 }
2570
2571 static void
2572 sse2_combine_in_ca (pixman_implementation_t *imp,
2573                     pixman_op_t              op,
2574                     uint32_t *               dst,
2575                     const uint32_t *         src,
2576                     const uint32_t *         mask,
2577                     int                      width)
2578 {
2579     core_combine_in_ca_sse2 (dst, src, mask, width);
2580     _mm_empty ();
2581 }
2582
2583 static void
2584 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2585                             pixman_op_t              op,
2586                             uint32_t *               dst,
2587                             const uint32_t *         src,
2588                             const uint32_t *         mask,
2589                             int                      width)
2590 {
2591     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2592     _mm_empty ();
2593 }
2594
2595 static void
2596 sse2_combine_out_ca (pixman_implementation_t *imp,
2597                      pixman_op_t              op,
2598                      uint32_t *               dst,
2599                      const uint32_t *         src,
2600                      const uint32_t *         mask,
2601                      int                      width)
2602 {
2603     core_combine_out_ca_sse2 (dst, src, mask, width);
2604     _mm_empty ();
2605 }
2606
2607 static void
2608 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2609                              pixman_op_t              op,
2610                              uint32_t *               dst,
2611                              const uint32_t *         src,
2612                              const uint32_t *         mask,
2613                              int                      width)
2614 {
2615     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2616     _mm_empty ();
2617 }
2618
2619 static void
2620 sse2_combine_atop_ca (pixman_implementation_t *imp,
2621                       pixman_op_t              op,
2622                       uint32_t *               dst,
2623                       const uint32_t *         src,
2624                       const uint32_t *         mask,
2625                       int                      width)
2626 {
2627     core_combine_atop_ca_sse2 (dst, src, mask, width);
2628     _mm_empty ();
2629 }
2630
2631 static void
2632 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2633                               pixman_op_t              op,
2634                               uint32_t *               dst,
2635                               const uint32_t *         src,
2636                               const uint32_t *         mask,
2637                               int                      width)
2638 {
2639     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2640     _mm_empty ();
2641 }
2642
2643 static void
2644 sse2_combine_xor_ca (pixman_implementation_t *imp,
2645                      pixman_op_t              op,
2646                      uint32_t *               dst,
2647                      const uint32_t *         src,
2648                      const uint32_t *         mask,
2649                      int                      width)
2650 {
2651     core_combine_xor_ca_sse2 (dst, src, mask, width);
2652     _mm_empty ();
2653 }
2654
2655 static void
2656 sse2_combine_add_ca (pixman_implementation_t *imp,
2657                      pixman_op_t              op,
2658                      uint32_t *               dst,
2659                      const uint32_t *         src,
2660                      const uint32_t *         mask,
2661                      int                      width)
2662 {
2663     core_combine_add_ca_sse2 (dst, src, mask, width);
2664     _mm_empty ();
2665 }
2666
2667 /* -------------------------------------------------------------------
2668  * composite_over_n_8888
2669  */
2670
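/* Like most of the compositing routines below, this processes each
 * scanline in three phases: single pixels until dst reaches a 16-byte
 * boundary, then four pixels per iteration using aligned 128-bit
 * stores, then a single-pixel tail for the remainder.
 */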
2671 static void
2672 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2673                             pixman_op_t              op,
2674                             pixman_image_t *         src_image,
2675                             pixman_image_t *         mask_image,
2676                             pixman_image_t *         dst_image,
2677                             int32_t                  src_x,
2678                             int32_t                  src_y,
2679                             int32_t                  mask_x,
2680                             int32_t                  mask_y,
2681                             int32_t                  dest_x,
2682                             int32_t                  dest_y,
2683                             int32_t                  width,
2684                             int32_t                  height)
2685 {
2686     uint32_t src;
2687     uint32_t    *dst_line, *dst, d;
2688     int32_t w;
2689     int dst_stride;
2690     __m128i xmm_src, xmm_alpha;
2691     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2692
2693     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2694
2695     if (src == 0)
2696         return;
2697
2698     PIXMAN_IMAGE_GET_LINE (
2699         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2700
2701     xmm_src = expand_pixel_32_1x128 (src);
2702     xmm_alpha = expand_alpha_1x128 (xmm_src);
2703
2704     while (height--)
2705     {
2706         dst = dst_line;
2707
2708         dst_line += dst_stride;
2709         w = width;
2710
2711         while (w && (unsigned long)dst & 15)
2712         {
2713             d = *dst;
2714             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2715                                               _mm_movepi64_pi64 (xmm_alpha),
2716                                               unpack_32_1x64 (d)));
2717             w--;
2718         }
2719
2720         while (w >= 4)
2721         {
2722             xmm_dst = load_128_aligned ((__m128i*)dst);
2723
2724             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2725
2726             over_2x128 (&xmm_src, &xmm_src,
2727                         &xmm_alpha, &xmm_alpha,
2728                         &xmm_dst_lo, &xmm_dst_hi);
2729
2730             /* rebuild the 4 pixel data and save */
2731             save_128_aligned (
2732                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2733
2734             w -= 4;
2735             dst += 4;
2736         }
2737
2738         while (w)
2739         {
2740             d = *dst;
2741             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2742                                               _mm_movepi64_pi64 (xmm_alpha),
2743                                               unpack_32_1x64 (d)));
2744             w--;
2745         }
2746
2747     }
2748     _mm_empty ();
2749 }
2750
2751 /* ---------------------------------------------------------------------
2752  * composite_over_n_0565
2753  */
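/* r5g6b5 pixels are 16 bits wide, so one 128-bit register holds eight
 * of them.  Each vector of destination pixels is widened to four
 * 8888-style registers with unpack_565_128_4x128, composited with
 * over_2x128, and narrowed back with pack_565_4x128_128.
 */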
2754 static void
2755 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2756                             pixman_op_t              op,
2757                             pixman_image_t *         src_image,
2758                             pixman_image_t *         mask_image,
2759                             pixman_image_t *         dst_image,
2760                             int32_t                  src_x,
2761                             int32_t                  src_y,
2762                             int32_t                  mask_x,
2763                             int32_t                  mask_y,
2764                             int32_t                  dest_x,
2765                             int32_t                  dest_y,
2766                             int32_t                  width,
2767                             int32_t                  height)
2768 {
2769     uint32_t src;
2770     uint16_t    *dst_line, *dst, d;
2771     int32_t w;
2772     int dst_stride;
2773     __m128i xmm_src, xmm_alpha;
2774     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2775
2776     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2777
2778     if (src == 0)
2779         return;
2780
2781     PIXMAN_IMAGE_GET_LINE (
2782         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2783
2784     xmm_src = expand_pixel_32_1x128 (src);
2785     xmm_alpha = expand_alpha_1x128 (xmm_src);
2786
2787     while (height--)
2788     {
2789         dst = dst_line;
2790
2791         dst_line += dst_stride;
2792         w = width;
2793
2794         while (w && (unsigned long)dst & 15)
2795         {
2796             d = *dst;
2797
2798             *dst++ = pack_565_32_16 (
2799                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2800                                          _mm_movepi64_pi64 (xmm_alpha),
2801                                          expand565_16_1x64 (d))));
2802             w--;
2803         }
2804
2805         while (w >= 8)
2806         {
2807             xmm_dst = load_128_aligned ((__m128i*)dst);
2808
2809             unpack_565_128_4x128 (xmm_dst,
2810                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2811
2812             over_2x128 (&xmm_src, &xmm_src,
2813                         &xmm_alpha, &xmm_alpha,
2814                         &xmm_dst0, &xmm_dst1);
2815             over_2x128 (&xmm_src, &xmm_src,
2816                         &xmm_alpha, &xmm_alpha,
2817                         &xmm_dst2, &xmm_dst3);
2818
2819             xmm_dst = pack_565_4x128_128 (
2820                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2821
2822             save_128_aligned ((__m128i*)dst, xmm_dst);
2823
2824             dst += 8;
2825             w -= 8;
2826         }
2827
2828         while (w--)
2829         {
2830             d = *dst;
2831             *dst++ = pack_565_32_16 (
2832                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2833                                          _mm_movepi64_pi64 (xmm_alpha),
2834                                          expand565_16_1x64 (d))));
2835         }
2836     }
2837
2838     _mm_empty ();
2839 }
2840
2841 /* ------------------------------
2842  * composite_add_n_8888_8888_ca
2843  */
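/* Component-alpha ADD with a solid source:
 *
 *     dest = clamp (src * mask + dest)
 *
 * The vector loop first tests the four mask pixels with a compare /
 * movemask pair, so blocks whose mask is entirely zero skip the
 * destination read-modify-write altogether.
 */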
2844 static void
2845 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2846                                    pixman_op_t              op,
2847                                    pixman_image_t *         src_image,
2848                                    pixman_image_t *         mask_image,
2849                                    pixman_image_t *         dst_image,
2850                                    int32_t                  src_x,
2851                                    int32_t                  src_y,
2852                                    int32_t                  mask_x,
2853                                    int32_t                  mask_y,
2854                                    int32_t                  dest_x,
2855                                    int32_t                  dest_y,
2856                                    int32_t                  width,
2857                                    int32_t                  height)
2858 {
2859     uint32_t src;
2860     uint32_t    *dst_line, d;
2861     uint32_t    *mask_line, m;
2862     uint32_t pack_cmp;
2863     int dst_stride, mask_stride;
2864
2865     __m128i xmm_src, xmm_alpha;
2866     __m128i xmm_dst;
2867     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2868
2869     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2870
2871     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2873
2874     if (src == 0)
2875         return;
2876
2877     PIXMAN_IMAGE_GET_LINE (
2878         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2879     PIXMAN_IMAGE_GET_LINE (
2880         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2881
2882     xmm_src = _mm_unpacklo_epi8 (
2883         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2884     xmm_alpha = expand_alpha_1x128 (xmm_src);
2885     mmx_src   = _mm_movepi64_pi64 (xmm_src);
2886     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2887
2888     while (height--)
2889     {
2890         int w = width;
2891         const uint32_t *pm = (uint32_t *)mask_line;
2892         uint32_t *pd = (uint32_t *)dst_line;
2893
2894         dst_line += dst_stride;
2895         mask_line += mask_stride;
2896
2897         while (w && (unsigned long)pd & 15)
2898         {
2899             m = *pm++;
2900
2901             if (m)
2902             {
2903                 d = *pd;
2904
2905                 mmx_mask = unpack_32_1x64 (m);
2906                 mmx_dest = unpack_32_1x64 (d);
2907
2908                 *pd = pack_1x64_32 (
2909                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2910             }
2911
2912             pd++;
2913             w--;
2914         }
2915
2916         while (w >= 4)
2917         {
2918             xmm_mask = load_128_unaligned ((__m128i*)pm);
2919
2920             pack_cmp =
2921                 _mm_movemask_epi8 (
2922                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2923
2924             /* if all four mask pixels are zero, pack_cmp is equal to 0xffff */
2925             if (pack_cmp != 0xffff)
2926             {
2927                 xmm_dst = load_128_aligned ((__m128i*)pd);
2928
2929                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2930
2931                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2932                                     &xmm_mask_lo, &xmm_mask_hi,
2933                                     &xmm_mask_lo, &xmm_mask_hi);
2934                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2935
2936                 save_128_aligned (
2937                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2938             }
2939
2940             pd += 4;
2941             pm += 4;
2942             w -= 4;
2943         }
2944
2945         while (w)
2946         {
2947             m = *pm++;
2948
2949             if (m)
2950             {
2951                 d = *pd;
2952
2953                 mmx_mask = unpack_32_1x64 (m);
2954                 mmx_dest = unpack_32_1x64 (d);
2955
2956                 *pd = pack_1x64_32 (
2957                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2958             }
2959
2960             pd++;
2961             w--;
2962         }
2963     }
2964
2965     _mm_empty ();
2966 }
2967
2968 /* ---------------------------------------------------------------------------
2969  * composite_over_n_8888_8888_ca
2970  */
2971
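/* Component-alpha OVER with a solid source:
 *
 *     dest = src * mask + dest * (1 - mask * src.a)
 *
 * implemented with in_over_1x64 / in_over_2x128; blocks whose mask
 * pixels are all zero are skipped, since they leave dst unchanged.
 */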
2972 static void
2973 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2974                                     pixman_op_t              op,
2975                                     pixman_image_t *         src_image,
2976                                     pixman_image_t *         mask_image,
2977                                     pixman_image_t *         dst_image,
2978                                     int32_t                  src_x,
2979                                     int32_t                  src_y,
2980                                     int32_t                  mask_x,
2981                                     int32_t                  mask_y,
2982                                     int32_t                  dest_x,
2983                                     int32_t                  dest_y,
2984                                     int32_t                  width,
2985                                     int32_t                  height)
2986 {
2987     uint32_t src;
2988     uint32_t    *dst_line, d;
2989     uint32_t    *mask_line, m;
2990     uint32_t pack_cmp;
2991     int dst_stride, mask_stride;
2992
2993     __m128i xmm_src, xmm_alpha;
2994     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2995     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2996
2997     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2998
2999     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3000
3001     if (src == 0)
3002         return;
3003
3004     PIXMAN_IMAGE_GET_LINE (
3005         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3006     PIXMAN_IMAGE_GET_LINE (
3007         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3008
3009     xmm_src = _mm_unpacklo_epi8 (
3010         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3011     xmm_alpha = expand_alpha_1x128 (xmm_src);
3012     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3013     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3014
3015     while (height--)
3016     {
3017         int w = width;
3018         const uint32_t *pm = (uint32_t *)mask_line;
3019         uint32_t *pd = (uint32_t *)dst_line;
3020
3021         dst_line += dst_stride;
3022         mask_line += mask_stride;
3023
3024         while (w && (unsigned long)pd & 15)
3025         {
3026             m = *pm++;
3027
3028             if (m)
3029             {
3030                 d = *pd;
3031                 mmx_mask = unpack_32_1x64 (m);
3032                 mmx_dest = unpack_32_1x64 (d);
3033
3034                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3035                                                   &mmx_alpha,
3036                                                   &mmx_mask,
3037                                                   &mmx_dest));
3038             }
3039
3040             pd++;
3041             w--;
3042         }
3043
3044         while (w >= 4)
3045         {
3046             xmm_mask = load_128_unaligned ((__m128i*)pm);
3047
3048             pack_cmp =
3049                 _mm_movemask_epi8 (
3050                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3051
3052             /* if all four mask pixels are zero, pack_cmp is equal to 0xffff */
3053             if (pack_cmp != 0xffff)
3054             {
3055                 xmm_dst = load_128_aligned ((__m128i*)pd);
3056
3057                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3058                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3059
3060                 in_over_2x128 (&xmm_src, &xmm_src,
3061                                &xmm_alpha, &xmm_alpha,
3062                                &xmm_mask_lo, &xmm_mask_hi,
3063                                &xmm_dst_lo, &xmm_dst_hi);
3064
3065                 save_128_aligned (
3066                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3067             }
3068
3069             pd += 4;
3070             pm += 4;
3071             w -= 4;
3072         }
3073
3074         while (w)
3075         {
3076             m = *pm++;
3077
3078             if (m)
3079             {
3080                 d = *pd;
3081                 mmx_mask = unpack_32_1x64 (m);
3082                 mmx_dest = unpack_32_1x64 (d);
3083
3084                 *pd = pack_1x64_32 (
3085                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3086             }
3087
3088             pd++;
3089             w--;
3090         }
3091     }
3092
3093     _mm_empty ();
3094 }
3095
3096 /*---------------------------------------------------------------------
3097  * composite_over_8888_n_8888
3098  */
3099
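/* OVER with a solid mask.  Only the alpha byte of the mask matters, so
 * it is replicated across xmm_mask once with create_mask_16_128;
 * fully transparent source blocks are detected with is_zero and
 * skipped.
 */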
3100 static void
3101 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3102                                  pixman_op_t              op,
3103                                  pixman_image_t *         src_image,
3104                                  pixman_image_t *         mask_image,
3105                                  pixman_image_t *         dst_image,
3106                                  int32_t                  src_x,
3107                                  int32_t                  src_y,
3108                                  int32_t                  mask_x,
3109                                  int32_t                  mask_y,
3110                                  int32_t                  dest_x,
3111                                  int32_t                  dest_y,
3112                                  int32_t                  width,
3113                                  int32_t                  height)
3114 {
3115     uint32_t    *dst_line, *dst;
3116     uint32_t    *src_line, *src;
3117     uint32_t mask;
3118     int32_t w;
3119     int dst_stride, src_stride;
3120
3121     __m128i xmm_mask;
3122     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3123     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3124     __m128i xmm_alpha_lo, xmm_alpha_hi;
3125
3126     PIXMAN_IMAGE_GET_LINE (
3127         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3128     PIXMAN_IMAGE_GET_LINE (
3129         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3130
3131     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3132
3133     xmm_mask = create_mask_16_128 (mask >> 24);
3134
3135     while (height--)
3136     {
3137         dst = dst_line;
3138         dst_line += dst_stride;
3139         src = src_line;
3140         src_line += src_stride;
3141         w = width;
3142
3143         while (w && (unsigned long)dst & 15)
3144         {
3145             uint32_t s = *src++;
3146
3147             if (s)
3148             {
3149                 uint32_t d = *dst;
3150                 
3151                 __m64 ms    = unpack_32_1x64 (s);
3152                 __m64 alpha = expand_alpha_1x64 (ms);
3153                 __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3154                 __m64 dest  = unpack_32_1x64 (d);
3155
3156                 *dst = pack_1x64_32 (
3157                     in_over_1x64 (&ms, &alpha, &mask, &dest));
3158             }
3159             dst++;
3160             w--;
3161         }
3162
3163         while (w >= 4)
3164         {
3165             xmm_src = load_128_unaligned ((__m128i*)src);
3166
3167             if (!is_zero (xmm_src))
3168             {
3169                 xmm_dst = load_128_aligned ((__m128i*)dst);
3170                 
3171                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3172                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3173                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3174                                     &xmm_alpha_lo, &xmm_alpha_hi);
3175                 
3176                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3177                                &xmm_alpha_lo, &xmm_alpha_hi,
3178                                &xmm_mask, &xmm_mask,
3179                                &xmm_dst_lo, &xmm_dst_hi);
3180                 
3181                 save_128_aligned (
3182                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3183             }
3184                 
3185             dst += 4;
3186             src += 4;
3187             w -= 4;
3188         }
3189
3190         while (w)
3191         {
3192             uint32_t s = *src++;
3193
3194             if (s)
3195             {
3196                 uint32_t d = *dst;
3197                 
3198                 __m64 ms = unpack_32_1x64 (s);
3199                 __m64 alpha = expand_alpha_1x64 (ms);
3200                 __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3201                 __m64 dest  = unpack_32_1x64 (d);
3202                 
3203                 *dst = pack_1x64_32 (
3204                     in_over_1x64 (&ms, &alpha, &mask, &dest));
3205             }
3206
3207             dst++;
3208             w--;
3209         }
3210     }
3211
3212     _mm_empty ();
3213 }
3214
3215 /*---------------------------------------------------------------------
3216  * composite_src_x888_8888
3217  */
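/* SRC from x8r8g8b8 to a8r8g8b8 only needs the undefined alpha byte
 * forced to 0xff, so each pixel is OR'ed with 0xff000000.  The vector
 * loop is unrolled to sixteen pixels (four 128-bit registers) per
 * iteration.
 */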
3218
3219 static void
3220 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
3221                               pixman_op_t              op,
3222                               pixman_image_t *         src_image,
3223                               pixman_image_t *         mask_image,
3224                               pixman_image_t *         dst_image,
3225                               int32_t                  src_x,
3226                               int32_t                  src_y,
3227                               int32_t                  mask_x,
3228                               int32_t                  mask_y,
3229                               int32_t                  dest_x,
3230                               int32_t                  dest_y,
3231                               int32_t                  width,
3232                               int32_t                  height)
3233 {
3234     uint32_t    *dst_line, *dst;
3235     uint32_t    *src_line, *src;
3236     int32_t w;
3237     int dst_stride, src_stride;
3238
3240     PIXMAN_IMAGE_GET_LINE (
3241         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3242     PIXMAN_IMAGE_GET_LINE (
3243         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3244
3245     while (height--)
3246     {
3247         dst = dst_line;
3248         dst_line += dst_stride;
3249         src = src_line;
3250         src_line += src_stride;
3251         w = width;
3252
3253         while (w && (unsigned long)dst & 15)
3254         {
3255             *dst++ = *src++ | 0xff000000;
3256             w--;
3257         }
3258
3259         while (w >= 16)
3260         {
3261             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
3262             
3263             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
3264             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
3265             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
3266             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
3267             
3268             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
3269             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
3270             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
3271             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
3272             
3273             dst += 16;
3274             src += 16;
3275             w -= 16;
3276         }
3277
3278         while (w)
3279         {
3280             *dst++ = *src++ | 0xff000000;
3281             w--;
3282         }
3283     }
3284
3285     _mm_empty ();
3286 }
3287
3288 /* ---------------------------------------------------------------------
3289  * composite_over_x888_n_8888
3290  */
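/* OVER of an x8r8g8b8 source, treated as opaque (alpha forced to
 * 0xff), through a solid mask.  With src.a == 1 the operator reduces
 * per channel to
 *
 *     dest = src * mask.a + dest * (1 - mask.a)
 */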
3291 static void
3292 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3293                                  pixman_op_t              op,
3294                                  pixman_image_t *         src_image,
3295                                  pixman_image_t *         mask_image,
3296                                  pixman_image_t *         dst_image,
3297                                  int32_t                  src_x,
3298                                  int32_t                  src_y,
3299                                  int32_t                  mask_x,
3300                                  int32_t                  mask_y,
3301                                  int32_t                  dest_x,
3302                                  int32_t                  dest_y,
3303                                  int32_t                  width,
3304                                  int32_t                  height)
3305 {
3306     uint32_t    *dst_line, *dst;
3307     uint32_t    *src_line, *src;
3308     uint32_t mask;
3309     int dst_stride, src_stride;
3310     int32_t w;
3311
3312     __m128i xmm_mask, xmm_alpha;
3313     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3314     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3315
3316     PIXMAN_IMAGE_GET_LINE (
3317         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3318     PIXMAN_IMAGE_GET_LINE (
3319         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3320
3321     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3322
3323     xmm_mask = create_mask_16_128 (mask >> 24);
3324     xmm_alpha = mask_00ff;
3325
3326     while (height--)
3327     {
3328         dst = dst_line;
3329         dst_line += dst_stride;
3330         src = src_line;
3331         src_line += src_stride;
3332         w = width;
3333
3334         while (w && (unsigned long)dst & 15)
3335         {
3336             uint32_t s = (*src++) | 0xff000000;
3337             uint32_t d = *dst;
3338
3339             __m64 src   = unpack_32_1x64 (s);
3340             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3341             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3342             __m64 dest  = unpack_32_1x64 (d);
3343
3344             *dst++ = pack_1x64_32 (
3345                 in_over_1x64 (&src, &alpha, &mask, &dest));
3346
3347             w--;
3348         }
3349
3350         while (w >= 4)
3351         {
3352             xmm_src = _mm_or_si128 (
3353                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3354             xmm_dst = load_128_aligned ((__m128i*)dst);
3355
3356             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3357             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3358
3359             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3360                            &xmm_alpha, &xmm_alpha,
3361                            &xmm_mask, &xmm_mask,
3362                            &xmm_dst_lo, &xmm_dst_hi);
3363
3364             save_128_aligned (
3365                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3366
3367             dst += 4;
3368             src += 4;
3369             w -= 4;
3370
3371         }
3372
3373         while (w)
3374         {
3375             uint32_t s = (*src++) | 0xff000000;
3376             uint32_t d = *dst;
3377
3378             __m64 src  = unpack_32_1x64 (s);
3379             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3380             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3381             __m64 dest  = unpack_32_1x64 (d);
3382
3383             *dst++ = pack_1x64_32 (
3384                 in_over_1x64 (&src, &alpha, &mask, &dest));
3385
3386             w--;
3387         }
3388     }
3389
3390     _mm_empty ();
3391 }
3392
3393 /* --------------------------------------------------------------------
3394  * composite_over_8888_8888
3395  */
3396 static void
3397 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3398                                pixman_op_t              op,
3399                                pixman_image_t *         src_image,
3400                                pixman_image_t *         mask_image,
3401                                pixman_image_t *         dst_image,
3402                                int32_t                  src_x,
3403                                int32_t                  src_y,
3404                                int32_t                  mask_x,
3405                                int32_t                  mask_y,
3406                                int32_t                  dest_x,
3407                                int32_t                  dest_y,
3408                                int32_t                  width,
3409                                int32_t                  height)
3410 {
3411     int dst_stride, src_stride;
3412     uint32_t    *dst_line, *dst;
3413     uint32_t    *src_line, *src;
3414
3415     PIXMAN_IMAGE_GET_LINE (
3416         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3417     PIXMAN_IMAGE_GET_LINE (
3418         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3419
3420     dst = dst_line;
3421     src = src_line;
3422
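     /* Process one scanline per iteration; core_combine_over_u_sse2
      * takes care of unaligned heads and tails itself.
      */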
3423     while (height--)
3424     {
3425         core_combine_over_u_sse2 (dst, src, NULL, width);
3426
3427         dst += dst_stride;
3428         src += src_stride;
3429     }
3430     _mm_empty ();
3431 }
3432
3433 /* ------------------------------------------------------------------
3434  * composite_over_8888_0565
3435  */
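     /* Composite a single a8r8g8b8 pixel OVER an r5g6b5 pixel: unpack the
      * source, expand its alpha, blend against the destination expanded to
      * 8888, and pack the result back to 565.
      */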
3436 static force_inline uint16_t
3437 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3438 {
3439     __m64 ms;
3440
3441     ms = unpack_32_1x64 (src);
3442     return pack_565_32_16 (
3443         pack_1x64_32 (
3444             over_1x64 (
3445                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3446 }
3447
3448 static void
3449 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3450                                pixman_op_t              op,
3451                                pixman_image_t *         src_image,
3452                                pixman_image_t *         mask_image,
3453                                pixman_image_t *         dst_image,
3454                                int32_t                  src_x,
3455                                int32_t                  src_y,
3456                                int32_t                  mask_x,
3457                                int32_t                  mask_y,
3458                                int32_t                  dest_x,
3459                                int32_t                  dest_y,
3460                                int32_t                  width,
3461                                int32_t                  height)
3462 {
3463     uint16_t    *dst_line, *dst, d;
3464     uint32_t    *src_line, *src, s;
3465     int dst_stride, src_stride;
3466     int32_t w;
3467
3468     __m128i xmm_alpha_lo, xmm_alpha_hi;
3469     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3470     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3471
3472     PIXMAN_IMAGE_GET_LINE (
3473         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3474     PIXMAN_IMAGE_GET_LINE (
3475         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3476
3477 #if 0
3478     /* FIXME
3479      *
3480      * This code was copied from the MMX implementation, FIXME and all.
3481      * If it is a problem there, it is probably a problem here too.
3482      */
3483     assert (src_image->drawable == mask_image->drawable);
3484 #endif
3485
3486     while (height--)
3487     {
3488         dst = dst_line;
3489         src = src_line;
3490
3491         dst_line += dst_stride;
3492         src_line += src_stride;
3493         w = width;
3494
3495         /* Align dst on a 16-byte boundary */
3496         while (w &&
3497                ((unsigned long)dst & 15))
3498         {
3499             s = *src++;
3500             d = *dst;
3501
3502             *dst++ = composite_over_8888_0565pixel (s, d);
3503             w--;
3504         }
3505
3506         /* Process 8 pixels per iteration */
3507         while (w >= 8)
3508         {
3509             /* The source address may not be 16-byte aligned,
3510              * so load it unaligned.
3511              */
3512             xmm_src = load_128_unaligned ((__m128i*) src);
3513             xmm_dst = load_128_aligned ((__m128i*) dst);
3514
3515             /* Unpacking */
3516             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3517             unpack_565_128_4x128 (xmm_dst,
3518                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3519             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3520                                 &xmm_alpha_lo, &xmm_alpha_hi);
3521
3522             /* Load the next 4 source pixels early so the memory
3523              * read overlaps with the computation below.
3524              */
3525             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3526
3527             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3528                         &xmm_alpha_lo, &xmm_alpha_hi,
3529                         &xmm_dst0, &xmm_dst1);
3530
3531             /* Unpacking */
3532             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3533             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3534                                 &xmm_alpha_lo, &xmm_alpha_hi);
3535
3536             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3537                         &xmm_alpha_lo, &xmm_alpha_hi,
3538                         &xmm_dst2, &xmm_dst3);
3539
3540             save_128_aligned (
3541                 (__m128i*)dst, pack_565_4x128_128 (
3542                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3543
3544             w -= 8;
3545             dst += 8;
3546             src += 8;
3547         }
3548
3549         while (w--)
3550         {
3551             s = *src++;
3552             d = *dst;
3553
3554             *dst++ = composite_over_8888_0565pixel (s, d);
3555         }
3556     }
3557
3558     _mm_empty ();
3559 }
3560
3561 /* -----------------------------------------------------------------
3562  * composite_over_n_8_8888
3563  */
3564
3565 static void
3566 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3567                               pixman_op_t              op,
3568                               pixman_image_t *         src_image,
3569                               pixman_image_t *         mask_image,
3570                               pixman_image_t *         dst_image,
3571                               int32_t                  src_x,
3572                               int32_t                  src_y,
3573                               int32_t                  mask_x,
3574                               int32_t                  mask_y,
3575                               int32_t                  dest_x,
3576                               int32_t                  dest_y,
3577                               int32_t                  width,
3578                               int32_t                  height)
3579 {
3580     uint32_t src, srca;
3581     uint32_t *dst_line, *dst;
3582     uint8_t *mask_line, *mask;
3583     int dst_stride, mask_stride;
3584     int32_t w;
3585     uint32_t m, d;
3586
3587     __m128i xmm_src, xmm_alpha, xmm_def;
3588     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3589     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3590
3591     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3592
3593     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3594
3595     srca = src >> 24;
3596     if (src == 0)
3597         return;
3598
3599     PIXMAN_IMAGE_GET_LINE (
3600         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3601     PIXMAN_IMAGE_GET_LINE (
3602         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3603
3604     xmm_def = create_mask_2x32_128 (src, src);
3605     xmm_src = expand_pixel_32_1x128 (src);
3606     xmm_alpha = expand_alpha_1x128 (xmm_src);
3607     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3608     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3609
3610     while (height--)
3611     {
3612         dst = dst_line;
3613         dst_line += dst_stride;
3614         mask = mask_line;
3615         mask_line += mask_stride;
3616         w = width;
3617
3618         while (w && (unsigned long)dst & 15)
3619         {
3620             uint8_t m = *mask++;
3621
3622             if (m)
3623             {
3624                 d = *dst;
3625                 mmx_mask = expand_pixel_8_1x64 (m);
3626                 mmx_dest = unpack_32_1x64 (d);
3627
3628                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3629                                                    &mmx_alpha,
3630                                                    &mmx_mask,
3631                                                    &mmx_dest));
3632             }
3633
3634             w--;
3635             dst++;
3636         }
3637
3638         while (w >= 4)
3639         {
3640             m = *((uint32_t*)mask);
3641
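                 /* Fully opaque source and fully set mask: the result
                  * is just the solid color, so store it directly.
                  */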
3642             if (srca == 0xff && m == 0xffffffff)
3643             {
3644                 save_128_aligned ((__m128i*)dst, xmm_def);
3645             }
3646             else if (m)
3647             {
3648                 xmm_dst = load_128_aligned ((__m128i*) dst);
3649                 xmm_mask = unpack_32_1x128 (m);
3650                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3651
3652                 /* Unpacking */
3653                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3654                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3655
3656                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3657                                         &xmm_mask_lo, &xmm_mask_hi);
3658
3659                 in_over_2x128 (&xmm_src, &xmm_src,
3660                                &xmm_alpha, &xmm_alpha,
3661                                &xmm_mask_lo, &xmm_mask_hi,
3662                                &xmm_dst_lo, &xmm_dst_hi);
3663
3664                 save_128_aligned (
3665                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3666             }
3667
3668             w -= 4;
3669             dst += 4;
3670             mask += 4;
3671         }
3672
3673         while (w)
3674         {
3675             uint8_t m = *mask++;
3676
3677             if (m)
3678             {
3679                 d = *dst;
3680                 mmx_mask = expand_pixel_8_1x64 (m);
3681                 mmx_dest = unpack_32_1x64 (d);
3682
3683                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3684                                                    &mmx_alpha,
3685                                                    &mmx_mask,
3686                                                    &mmx_dest));
3687             }
3688
3689             w--;
3690             dst++;
3691         }
3692     }
3693
3694     _mm_empty ();
3695 }
3696
3697 /* ----------------------------------------------------------------
3698  * pixman_fill_sse2 / composite_src_n_8_8888
3699  */
3700
3701 pixman_bool_t
3702 pixman_fill_sse2 (uint32_t *bits,
3703                   int       stride,
3704                   int       bpp,
3705                   int       x,
3706                   int       y,
3707                   int       width,
3708                   int       height,
3709                   uint32_t  data)
3710 {
3711     uint32_t byte_width;
3712     uint8_t         *byte_line;
3713
3714     __m128i xmm_def;
3715
3716     if (bpp == 8)
3717     {
3718         uint8_t b;
3719         uint16_t w;
3720
3721         stride = stride * (int) sizeof (uint32_t) / 1;
3722         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3723         byte_width = width;
3724         stride *= 1;
3725
3726         b = data & 0xff;
3727         w = (b << 8) | b;
3728         data = (w << 16) | w;
3729     }
3730     else if (bpp == 16)
3731     {
3732         stride = stride * (int) sizeof (uint32_t) / 2;
3733         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3734         byte_width = 2 * width;
3735         stride *= 2;
3736
3737         data = (data & 0xffff) * 0x00010001;
3738     }
3739     else if (bpp == 32)
3740     {
3741         stride = stride * (int) sizeof (uint32_t) / 4;
3742         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3743         byte_width = 4 * width;
3744         stride *= 4;
3745     }
3746     else
3747     {
3748         return FALSE;
3749     }
3750
3751     xmm_def = create_mask_2x32_128 (data, data);
3752
3753     while (height--)
3754     {
3755         int w;
3756         uint8_t *d = byte_line;
3757         byte_line += stride;
3758         w = byte_width;
3759
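         /* Align the destination progressively: single bytes until
          * 2-byte aligned, 16-bit stores until 4-byte aligned, then
          * 32-bit stores until 16-byte aligned.
          */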
3760         while (w >= 1 && ((unsigned long)d & 1))
3761         {
3762             *(uint8_t *)d = data;
3763             w -= 1;
3764             d += 1;
3765         }
3766
3767         while (w >= 2 && ((unsigned long)d & 3))
3768         {
3769             *(uint16_t *)d = data;
3770             w -= 2;
3771             d += 2;
3772         }
3773
3774         while (w >= 4 && ((unsigned long)d & 15))
3775         {
3776             *(uint32_t *)d = data;
3777
3778             w -= 4;
3779             d += 4;
3780         }
3781
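         /* Bulk fill: 128 bytes per iteration with eight aligned stores,
          * then fall through to progressively smaller chunks below.
          */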
3782         while (w >= 128)
3783         {
3784             save_128_aligned ((__m128i*)(d),     xmm_def);
3785             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3786             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3787             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3788             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3789             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3790             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3791             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3792
3793             d += 128;
3794             w -= 128;
3795         }
3796
3797         if (w >= 64)
3798         {
3799             save_128_aligned ((__m128i*)(d),     xmm_def);
3800             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3801             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3802             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3803
3804             d += 64;
3805             w -= 64;
3806         }
3807
3808         if (w >= 32)
3809         {
3810             save_128_aligned ((__m128i*)(d),     xmm_def);
3811             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3812
3813             d += 32;
3814             w -= 32;
3815         }
3816
3817         if (w >= 16)
3818         {
3819             save_128_aligned ((__m128i*)(d),     xmm_def);
3820
3821             d += 16;
3822             w -= 16;
3823         }
3824
3825         while (w >= 4)
3826         {
3827             *(uint32_t *)d = data;
3828
3829             w -= 4;
3830             d += 4;
3831         }
3832
3833         if (w >= 2)
3834         {
3835             *(uint16_t *)d = data;
3836             w -= 2;
3837             d += 2;
3838         }
3839
3840         if (w >= 1)
3841         {
3842             *(uint8_t *)d = data;
3843             w -= 1;
3844             d += 1;
3845         }
3846     }
3847
3848     _mm_empty ();
3849     return TRUE;
3850 }
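
     /* A hypothetical usage sketch: fill a 100x50 region of a 32 bpp
      * surface with opaque red, assuming 'bits' and 'rowstride' (in
      * uint32_t units) describe the destination:
      *
      *     pixman_fill_sse2 (bits, rowstride, 32, 10, 20, 100, 50,
      *                       0xffff0000);
      */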
3851
3852 static void
3853 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3854                              pixman_op_t              op,
3855                              pixman_image_t *         src_image,
3856                              pixman_image_t *         mask_image,
3857                              pixman_image_t *         dst_image,
3858                              int32_t                  src_x,
3859                              int32_t                  src_y,
3860                              int32_t                  mask_x,
3861                              int32_t                  mask_y,
3862                              int32_t                  dest_x,
3863                              int32_t                  dest_y,
3864                              int32_t                  width,
3865                              int32_t                  height)
3866 {
3867     uint32_t src, srca;
3868     uint32_t    *dst_line, *dst;
3869     uint8_t     *mask_line, *mask;
3870     int dst_stride, mask_stride;
3871     int32_t w;
3872     uint32_t m;
3873
3874     __m128i xmm_src, xmm_def;
3875     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3876
3877     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3878
3879     srca = src >> 24;
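     /* With the SRC operator a zero source clears the whole rectangle,
      * regardless of the mask.
      */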
3880     if (src == 0)
3881     {
3882         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3883                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
3884                           dest_x, dest_y, width, height, 0);
3885         return;
3886     }
3887
3888     PIXMAN_IMAGE_GET_LINE (
3889         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3890     PIXMAN_IMAGE_GET_LINE (
3891         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3892
3893     xmm_def = create_mask_2x32_128 (src, src);
3894     xmm_src = expand_pixel_32_1x128 (src);
3895
3896     while (height--)
3897     {
3898         dst = dst_line;
3899         dst_line += dst_stride;
3900         mask = mask_line;
3901         mask_line += mask_stride;
3902         w = width;
3903
3904         while (w && (unsigned long)dst & 15)
3905         {
3906             uint8_t m = *mask++;
3907
3908             if (m)
3909             {
3910                 *dst = pack_1x64_32 (
3911                     pix_multiply_1x64 (
3912                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3913             }
3914             else
3915             {
3916                 *dst = 0;
3917             }
3918
3919             w--;
3920             dst++;
3921         }
3922
3923         while (w >= 4)
3924         {
3925             m = *((uint32_t*)mask);
3926
3927             if (srca == 0xff && m == 0xffffffff)
3928             {
3929                 save_128_aligned ((__m128i*)dst, xmm_def);
3930             }
3931             else if (m)
3932             {
3933                 xmm_mask = unpack_32_1x128 (m);
3934                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3935
3936                 /* Unpacking */
3937                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3938
3939                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3940                                         &xmm_mask_lo, &xmm_mask_hi);
3941
3942                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3943                                     &xmm_mask_lo, &xmm_mask_hi,
3944                                     &xmm_mask_lo, &xmm_mask_hi);
3945
3946                 save_128_aligned (
3947                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3948             }
3949             else
3950             {
3951                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3952             }
3953
3954             w -= 4;
3955             dst += 4;
3956             mask += 4;
3957         }
3958
3959         while (w)
3960         {
3961             uint8_t m = *mask++;
3962
3963             if (m)
3964             {
3965                 *dst = pack_1x64_32 (
3966                     pix_multiply_1x64 (
3967                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3968             }
3969             else
3970             {
3971                 *dst = 0;
3972             }
3973
3974             w--;
3975             dst++;
3976         }
3977     }
3978
3979     _mm_empty ();
3980 }
3981
3982 /*-----------------------------------------------------------------------
3983  * composite_over_n_8_0565
3984  */
3985
3986 static void
3987 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3988                               pixman_op_t              op,
3989                               pixman_image_t *         src_image,
3990                               pixman_image_t *         mask_image,
3991                               pixman_image_t *         dst_image,
3992                               int32_t                  src_x,
3993                               int32_t                  src_y,
3994                               int32_t                  mask_x,
3995                               int32_t                  mask_y,
3996                               int32_t                  dest_x,
3997                               int32_t                  dest_y,
3998                               int32_t                  width,
3999                               int32_t                  height)
4000 {
4001     uint32_t src, srca;
4002     uint16_t    *dst_line, *dst, d;
4003     uint8_t     *mask_line, *mask;
4004     int dst_stride, mask_stride;
4005     int32_t w;
4006     uint32_t m;
4007     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4008
4009     __m128i xmm_src, xmm_alpha;
4010     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4011     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4012
4013     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4014
4015     srca = src >> 24;
4016     if (src == 0)
4017         return;
4018
4019     PIXMAN_IMAGE_GET_LINE (
4020         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4021     PIXMAN_IMAGE_GET_LINE (
4022         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4023
4024     xmm_src = expand_pixel_32_1x128 (src);
4025     xmm_alpha = expand_alpha_1x128 (xmm_src);
4026     mmx_src = _mm_movepi64_pi64 (xmm_src);
4027     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4028
4029     while (height--)
4030     {
4031         dst = dst_line;
4032         dst_line += dst_stride;
4033         mask = mask_line;
4034         mask_line += mask_stride;
4035         w = width;
4036
4037         while (w && (unsigned long)dst & 15)
4038         {
4039             m = *mask++;
4040
4041             if (m)
4042             {
4043                 d = *dst;
4044                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4045                 mmx_dest = expand565_16_1x64 (d);
4046
4047                 *dst = pack_565_32_16 (
4048                     pack_1x64_32 (
4049                         in_over_1x64 (
4050                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4051             }
4052
4053             w--;
4054             dst++;
4055         }
4056
4057         while (w >= 8)
4058         {
4059             xmm_dst = load_128_aligned ((__m128i*) dst);
4060             unpack_565_128_4x128 (xmm_dst,
4061                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4062
4063             m = *((uint32_t*)mask);
4064             mask += 4;
4065
4066             if (m)
4067             {
4068                 xmm_mask = unpack_32_1x128 (m);
4069                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4070
4071                 /* Unpacking */
4072                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4073
4074                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4075                                         &xmm_mask_lo, &xmm_mask_hi);
4076
4077                 in_over_2x128 (&xmm_src, &xmm_src,
4078                                &xmm_alpha, &xmm_alpha,
4079                                &xmm_mask_lo, &xmm_mask_hi,
4080                                &xmm_dst0, &xmm_dst1);
4081             }
4082
4083             m = *((uint32_t*)mask);
4084             mask += 4;
4085
4086             if (m)
4087             {
4088                 xmm_mask = unpack_32_1x128 (m);
4089                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4090
4091                 /* Unpacking */
4092                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4093
4094                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4095                                         &xmm_mask_lo, &xmm_mask_hi);
4096                 in_over_2x128 (&xmm_src, &xmm_src,
4097                                &xmm_alpha, &xmm_alpha,
4098                                &xmm_mask_lo, &xmm_mask_hi,
4099                                &xmm_dst2, &xmm_dst3);
4100             }
4101
4102             save_128_aligned (
4103                 (__m128i*)dst, pack_565_4x128_128 (
4104                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4105
4106             w -= 8;
4107             dst += 8;
4108         }
4109
4110         while (w)
4111         {
4112             m = *mask++;
4113
4114             if (m)
4115             {
4116                 d = *dst;
4117                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4118                 mmx_dest = expand565_16_1x64 (d);
4119
4120                 *dst = pack_565_32_16 (
4121                     pack_1x64_32 (
4122                         in_over_1x64 (
4123                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4124             }
4125
4126             w--;
4127             dst++;
4128         }
4129     }
4130
4131     _mm_empty ();
4132 }
4133
4134 /* -----------------------------------------------------------------------
4135  * composite_over_pixbuf_0565
4136  */
4137
4138 static void
4139 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4140                                  pixman_op_t              op,
4141                                  pixman_image_t *         src_image,
4142                                  pixman_image_t *         mask_image,
4143                                  pixman_image_t *         dst_image,
4144                                  int32_t                  src_x,
4145                                  int32_t                  src_y,
4146                                  int32_t                  mask_x,
4147                                  int32_t                  mask_y,
4148                                  int32_t                  dest_x,
4149                                  int32_t                  dest_y,
4150                                  int32_t                  width,
4151                                  int32_t                  height)
4152 {
4153     uint16_t    *dst_line, *dst, d;
4154     uint32_t    *src_line, *src, s;
4155     int dst_stride, src_stride;
4156     int32_t w;
4157     uint32_t opaque, zero;
4158
4159     __m64 ms;
4160     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4161     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4162
4163     PIXMAN_IMAGE_GET_LINE (
4164         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4165     PIXMAN_IMAGE_GET_LINE (
4166         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4167
4168 #if 0
4169     /* FIXME
4170      *
4171      * This code was copied from the MMX implementation, FIXME and all.
4172      * If it is a problem there, it is probably a problem here too.
4173      */
4174     assert (src_image->drawable == mask_image->drawable);
4175 #endif
4176
4177     while (height--)
4178     {
4179         dst = dst_line;
4180         dst_line += dst_stride;
4181         src = src_line;
4182         src_line += src_stride;
4183         w = width;
4184
4185         while (w && (unsigned long)dst & 15)
4186         {
4187             s = *src++;
4188             d = *dst;
4189
4190             ms = unpack_32_1x64 (s);
4191
4192             *dst++ = pack_565_32_16 (
4193                 pack_1x64_32 (
4194                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4195             w--;
4196         }
4197
4198         while (w >= 8)
4199         {
4200             /* First round */
4201             xmm_src = load_128_unaligned ((__m128i*)src);
4202             xmm_dst = load_128_aligned  ((__m128i*)dst);
4203
4204             opaque = is_opaque (xmm_src);
4205             zero = is_zero (xmm_src);
4206
4207             unpack_565_128_4x128 (xmm_dst,
4208                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4209             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4210
4211             /* preload next round */
4212             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4213
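             /* A fully opaque block only needs its color channels
              * swapped into place; a fully transparent block leaves the
              * destination untouched.
              */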
4214             if (opaque)
4215             {
4216                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4217                                      &xmm_dst0, &xmm_dst1);
4218             }
4219             else if (!zero)
4220             {
4221                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4222                                         &xmm_dst0, &xmm_dst1);
4223             }
4224
4225             /* Second round */
4226             opaque = is_opaque (xmm_src);
4227             zero = is_zero (xmm_src);
4228
4229             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4230
4231             if (opaque)
4232             {
4233                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4234                                      &xmm_dst2, &xmm_dst3);
4235             }
4236             else if (!zero)
4237             {
4238                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4239                                         &xmm_dst2, &xmm_dst3);
4240             }
4241
4242             save_128_aligned (
4243                 (__m128i*)dst, pack_565_4x128_128 (
4244                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4245
4246             w -= 8;
4247             src += 8;
4248             dst += 8;
4249         }
4250
4251         while (w)
4252         {
4253             s = *src++;
4254             d = *dst;
4255
4256             ms = unpack_32_1x64 (s);
4257
4258             *dst++ = pack_565_32_16 (
4259                 pack_1x64_32 (
4260                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4261             w--;
4262         }
4263     }
4264
4265     _mm_empty ();
4266 }
4267
4268 /* -------------------------------------------------------------------------
4269  * composite_over_pixbuf_8888
4270  */
4271
4272 static void
4273 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4274                                  pixman_op_t              op,
4275                                  pixman_image_t *         src_image,
4276                                  pixman_image_t *         mask_image,
4277                                  pixman_image_t *         dst_image,
4278                                  int32_t                  src_x,
4279                                  int32_t                  src_y,
4280                                  int32_t                  mask_x,
4281                                  int32_t                  mask_y,
4282                                  int32_t                  dest_x,
4283                                  int32_t                  dest_y,
4284                                  int32_t                  width,
4285                                  int32_t                  height)
4286 {
4287     uint32_t    *dst_line, *dst, d;
4288     uint32_t    *src_line, *src, s;
4289     int dst_stride, src_stride;
4290     int32_t w;
4291     uint32_t opaque, zero;
4292
4293     __m128i xmm_src_lo, xmm_src_hi;
4294     __m128i xmm_dst_lo, xmm_dst_hi;
4295
4296     PIXMAN_IMAGE_GET_LINE (
4297         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4298     PIXMAN_IMAGE_GET_LINE (
4299         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4300
4301 #if 0
4302     /* FIXME
4303      *
4304      * This code was copied from the MMX implementation, FIXME and all.
4305      * If it is a problem there, it is probably a problem here too.
4306      */
4307     assert (src_image->drawable == mask_image->drawable);
4308 #endif
4309
4310     while (height--)
4311     {
4312         dst = dst_line;
4313         dst_line += dst_stride;
4314         src = src_line;
4315         src_line += src_stride;
4316         w = width;
4317
4318         while (w && (unsigned long)dst & 15)
4319         {
4320             s = *src++;
4321             d = *dst;
4322
4323             *dst++ = pack_1x64_32 (
4324                 over_rev_non_pre_1x64 (
4325                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4326
4327             w--;
4328         }
4329
4330         while (w >= 4)
4331         {
4332             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4333
4334             opaque = is_opaque (xmm_src_hi);
4335             zero = is_zero (xmm_src_hi);
4336
4337             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4338
4339             if (opaque)
4340             {
4341                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4342                                      &xmm_dst_lo, &xmm_dst_hi);
4343
4344                 save_128_aligned (
4345                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4346             }
4347             else if (!zero)
4348             {
4349                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4350
4351                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4352
4353                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4354                                         &xmm_dst_lo, &xmm_dst_hi);
4355
4356                 save_128_aligned (
4357                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4358             }
4359
4360             w -= 4;
4361             dst += 4;
4362             src += 4;
4363         }
4364
4365         while (w)
4366         {
4367             s = *src++;
4368             d = *dst;
4369
4370             *dst++ = pack_1x64_32 (
4371                 over_rev_non_pre_1x64 (
4372                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4373
4374             w--;
4375         }
4376     }
4377
4378     _mm_empty ();
4379 }
4380
4381 /* -------------------------------------------------------------------------------------------------
4382  * composite_over_n_8888_0565_ca
4383  */
4384
4385 static void
4386 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4387                                     pixman_op_t              op,
4388                                     pixman_image_t *         src_image,
4389                                     pixman_image_t *         mask_image,
4390                                     pixman_image_t *         dst_image,
4391                                     int32_t                  src_x,
4392                                     int32_t                  src_y,
4393                                     int32_t                  mask_x,
4394                                     int32_t                  mask_y,
4395                                     int32_t                  dest_x,
4396                                     int32_t                  dest_y,
4397                                     int32_t                  width,
4398                                     int32_t                  height)
4399 {
4400     uint32_t src;
4401     uint16_t    *dst_line, *dst, d;
4402     uint32_t    *mask_line, *mask, m;
4403     int dst_stride, mask_stride;
4404     int32_t w;
4405     uint32_t pack_cmp;
4406
4407     __m128i xmm_src, xmm_alpha;
4408     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4409     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4410
4411     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4412
4413     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4414
4415     if (src == 0)
4416         return;
4417
4418     PIXMAN_IMAGE_GET_LINE (
4419         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4420     PIXMAN_IMAGE_GET_LINE (
4421         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4422
4423     xmm_src = expand_pixel_32_1x128 (src);
4424     xmm_alpha = expand_alpha_1x128 (xmm_src);
4425     mmx_src = _mm_movepi64_pi64 (xmm_src);
4426     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4427
4428     while (height--)
4429     {
4430         w = width;
4431         mask = mask_line;
4432         dst = dst_line;
4433         mask_line += mask_stride;
4434         dst_line += dst_stride;
4435
4436         while (w && ((unsigned long)dst & 15))
4437         {
4438             m = *(uint32_t *) mask;
4439
4440             if (m)
4441             {
4442                 d = *dst;
4443                 mmx_mask = unpack_32_1x64 (m);
4444                 mmx_dest = expand565_16_1x64 (d);
4445
4446                 *dst = pack_565_32_16 (
4447                     pack_1x64_32 (
4448                         in_over_1x64 (
4449                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4450             }
4451
4452             w--;
4453             dst++;
4454             mask++;
4455         }
4456
4457         while (w >= 8)
4458         {
4459             /* First round */
4460             xmm_mask = load_128_unaligned ((__m128i*)mask);
4461             xmm_dst = load_128_aligned ((__m128i*)dst);
4462
4463             pack_cmp = _mm_movemask_epi8 (
4464                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
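             /* pack_cmp is 0xffff iff all four mask pixels are zero, in
              * which case this half of the block can be skipped.
              */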
4465
4466             unpack_565_128_4x128 (xmm_dst,
4467                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4468             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4469
4470             /* preload next round */
4471             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4472
4474             if (pack_cmp != 0xffff)
4475             {
4476                 in_over_2x128 (&xmm_src, &xmm_src,
4477                                &xmm_alpha, &xmm_alpha,
4478                                &xmm_mask_lo, &xmm_mask_hi,
4479                                &xmm_dst0, &xmm_dst1);
4480             }
4481
4482             /* Second round */
4483             pack_cmp = _mm_movemask_epi8 (
4484                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4485
4486             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4487
4488             if (pack_cmp != 0xffff)
4489             {
4490                 in_over_2x128 (&xmm_src, &xmm_src,
4491                                &xmm_alpha, &xmm_alpha,
4492                                &xmm_mask_lo, &xmm_mask_hi,
4493                                &xmm_dst2, &xmm_dst3);
4494             }
4495
4496             save_128_aligned (
4497                 (__m128i*)dst, pack_565_4x128_128 (
4498                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4499
4500             w -= 8;
4501             dst += 8;
4502             mask += 8;
4503         }
4504
4505         while (w)
4506         {
4507             m = *(uint32_t *) mask;
4508
4509             if (m)
4510             {
4511                 d = *dst;
4512                 mmx_mask = unpack_32_1x64 (m);
4513                 mmx_dest = expand565_16_1x64 (d);
4514
4515                 *dst = pack_565_32_16 (
4516                     pack_1x64_32 (
4517                         in_over_1x64 (
4518                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4519             }
4520
4521             w--;
4522             dst++;
4523             mask++;
4524         }
4525     }
4526
4527     _mm_empty ();
4528 }
4529
4530 /* -----------------------------------------------------------------------
4531  * composite_in_n_8_8
4532  */
4533
4534 static void
4535 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4536                          pixman_op_t              op,
4537                          pixman_image_t *         src_image,
4538                          pixman_image_t *         mask_image,
4539                          pixman_image_t *         dst_image,
4540                          int32_t                  src_x,
4541                          int32_t                  src_y,
4542                          int32_t                  mask_x,
4543                          int32_t                  mask_y,
4544                          int32_t                  dest_x,
4545                          int32_t                  dest_y,
4546                          int32_t                  width,
4547                          int32_t                  height)
4548 {
4549     uint8_t     *dst_line, *dst;
4550     uint8_t     *mask_line, *mask;
4551     int dst_stride, mask_stride;
4552     uint32_t d, m;
4553     uint32_t src;
4554     uint8_t sa;
4555     int32_t w;
4556
4557     __m128i xmm_alpha;
4558     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4559     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4560
4561     PIXMAN_IMAGE_GET_LINE (
4562         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4563     PIXMAN_IMAGE_GET_LINE (
4564         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4565
4566     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4567
4568     sa = src >> 24;
4569
4570     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4571
4572     while (height--)
4573     {
4574         dst = dst_line;
4575         dst_line += dst_stride;
4576         mask = mask_line;
4577         mask_line += mask_stride;
4578         w = width;
4579
4580         while (w && ((unsigned long)dst & 15))
4581         {
4582             m = (uint32_t) *mask++;
4583             d = (uint32_t) *dst;
4584
4585             *dst++ = (uint8_t) pack_1x64_32 (
4586                 pix_multiply_1x64 (
4587                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4588                                        unpack_32_1x64 (m)),
4589                     unpack_32_1x64 (d)));
4590             w--;
4591         }
4592
4593         while (w >= 16)
4594         {
4595             xmm_mask = load_128_unaligned ((__m128i*)mask);
4596             xmm_dst = load_128_aligned ((__m128i*)dst);
4597
4598             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4599             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4600
4601             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4602                                 &xmm_mask_lo, &xmm_mask_hi,
4603                                 &xmm_mask_lo, &xmm_mask_hi);
4604
4605             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4606                                 &xmm_dst_lo, &xmm_dst_hi,
4607                                 &xmm_dst_lo, &xmm_dst_hi);
4608
4609             save_128_aligned (
4610                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4611
4612             mask += 16;
4613             dst += 16;
4614             w -= 16;
4615         }
4616
4617         while (w)
4618         {
4619             m = (uint32_t) *mask++;
4620             d = (uint32_t) *dst;
4621
4622             *dst++ = (uint8_t) pack_1x64_32 (
4623                 pix_multiply_1x64 (
4624                     pix_multiply_1x64 (
4625                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4626                     unpack_32_1x64 (d)));
4627             w--;
4628         }
4629     }
4630
4631     _mm_empty ();
4632 }
4633
4634 /* -----------------------------------------------------------------------
4635  * composite_in_n_8
4636  */
4637
4638 static void
4639 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4640                        pixman_op_t              op,
4641                        pixman_image_t *         src_image,
4642                        pixman_image_t *         mask_image,
4643                        pixman_image_t *         dst_image,
4644                        int32_t                  src_x,
4645                        int32_t                  src_y,
4646                        int32_t                  mask_x,
4647                        int32_t                  mask_y,
4648                        int32_t                  dest_x,
4649                        int32_t                  dest_y,
4650                        int32_t                  width,
4651                        int32_t                  height)
4652 {
4653     uint8_t     *dst_line, *dst;
4654     int dst_stride;
4655     uint32_t d;
4656     uint32_t src;
4657     int32_t w;
4658
4659     __m128i xmm_alpha;
4660     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4661
4662     PIXMAN_IMAGE_GET_LINE (
4663         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4664
4665     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4666
4667     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4668
4669     src = src >> 24;
4670
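     /* IN with an opaque source leaves the destination unchanged; a zero
      * source clears it. Both cases are handled without the pixel loop.
      */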
4671     if (src == 0xff)
4672         return;
4673
4674     if (src == 0x00)
4675     {
4676         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4677                      8, dest_x, dest_y, width, height, src);
4678
4679         return;
4680     }
4681
4682     while (height--)
4683     {
4684         dst = dst_line;
4685         dst_line += dst_stride;
4686         w = width;
4687
4688         while (w && ((unsigned long)dst & 15))
4689         {
4690             d = (uint32_t) *dst;
4691
4692             *dst++ = (uint8_t) pack_1x64_32 (
4693                 pix_multiply_1x64 (
4694                     _mm_movepi64_pi64 (xmm_alpha),
4695                     unpack_32_1x64 (d)));
4696             w--;
4697         }
4698
4699         while (w >= 16)
4700         {
4701             xmm_dst = load_128_aligned ((__m128i*)dst);
4702
4703             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4704
4705             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4706                                 &xmm_dst_lo, &xmm_dst_hi,
4707                                 &xmm_dst_lo, &xmm_dst_hi);
4708
4709             save_128_aligned (
4710                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4711
4712             dst += 16;
4713             w -= 16;
4714         }
4715
4716         while (w)
4717         {
4718             d = (uint32_t) *dst;
4719
4720             *dst++ = (uint8_t) pack_1x64_32 (
4721                 pix_multiply_1x64 (
4722                     _mm_movepi64_pi64 (xmm_alpha),
4723                     unpack_32_1x64 (d)));
4724             w--;
4725         }
4726     }
4727
4728     _mm_empty ();
4729 }
4730
4731 /* ---------------------------------------------------------------------------
4732  * composite_in_8_8
4733  */
4734
4735 static void
4736 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4737                        pixman_op_t              op,
4738                        pixman_image_t *         src_image,
4739                        pixman_image_t *         mask_image,
4740                        pixman_image_t *         dst_image,
4741                        int32_t                  src_x,
4742                        int32_t                  src_y,
4743                        int32_t                  mask_x,
4744                        int32_t                  mask_y,
4745                        int32_t                  dest_x,
4746                        int32_t                  dest_y,
4747                        int32_t                  width,
4748                        int32_t                  height)
4749 {
4750     uint8_t     *dst_line, *dst;
4751     uint8_t     *src_line, *src;
4752     int src_stride, dst_stride;
4753     int32_t w;
4754     uint32_t s, d;
4755
4756     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4757     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4758
4759     PIXMAN_IMAGE_GET_LINE (
4760         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4761     PIXMAN_IMAGE_GET_LINE (
4762         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4763
4764     while (height--)
4765     {
4766         dst = dst_line;
4767         dst_line += dst_stride;
4768         src = src_line;
4769         src_line += src_stride;
4770         w = width;
4771
4772         while (w && ((unsigned long)dst & 15))
4773         {
4774             s = (uint32_t) *src++;
4775             d = (uint32_t) *dst;
4776
4777             *dst++ = (uint8_t) pack_1x64_32 (
4778                 pix_multiply_1x64 (
4779                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4780             w--;
4781         }
4782
4783         while (w >= 16)
4784         {
4785             xmm_src = load_128_unaligned ((__m128i*)src);
4786             xmm_dst = load_128_aligned ((__m128i*)dst);
4787
4788             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4789             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4790
4791             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4792                                 &xmm_dst_lo, &xmm_dst_hi,
4793                                 &xmm_dst_lo, &xmm_dst_hi);
4794
4795             save_128_aligned (
4796                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4797
4798             src += 16;
4799             dst += 16;
4800             w -= 16;
4801         }
4802
4803         while (w)
4804         {
4805             s = (uint32_t) *src++;
4806             d = (uint32_t) *dst;
4807
4808             *dst++ = (uint8_t) pack_1x64_32 (
4809                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
4810             w--;
4811         }
4812     }
4813
4814     _mm_empty ();
4815 }
4816
4817 /* -------------------------------------------------------------------------
4818  * composite_add_n_8_8
4819  */
4820
4821 static void
4822 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4823                           pixman_op_t              op,
4824                           pixman_image_t *         src_image,
4825                           pixman_image_t *         mask_image,
4826                           pixman_image_t *         dst_image,
4827                           int32_t                  src_x,
4828                           int32_t                  src_y,
4829                           int32_t                  mask_x,
4830                           int32_t                  mask_y,
4831                           int32_t                  dest_x,
4832                           int32_t                  dest_y,
4833                           int32_t                  width,
4834                           int32_t                  height)
4835 {
4836     uint8_t     *dst_line, *dst;
4837     uint8_t     *mask_line, *mask;
4838     int dst_stride, mask_stride;
4839     int32_t w;
4840     uint32_t src;
4841     uint8_t sa;
4842     uint32_t m, d;
4843
4844     __m128i xmm_alpha;
4845     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4846     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4847
4848     PIXMAN_IMAGE_GET_LINE (
4849         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4850     PIXMAN_IMAGE_GET_LINE (
4851         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4852
4853     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4854
4855     sa = src >> 24;
4856
4857     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4858
4859     while (height--)
4860     {
4861         dst = dst_line;
4862         dst_line += dst_stride;
4863         mask = mask_line;
4864         mask_line += mask_stride;
4865         w = width;
4866
4867         while (w && ((unsigned long)dst & 15))
4868         {
4869             m = (uint32_t) *mask++;
4870             d = (uint32_t) *dst;
4871
4872             *dst++ = (uint8_t) pack_1x64_32 (
4873                 _mm_adds_pu16 (
4874                     pix_multiply_1x64 (
4875                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4876                     unpack_32_1x64 (d)));
4877             w--;
4878         }
4879
4880         while (w >= 16)
4881         {
4882             xmm_mask = load_128_unaligned ((__m128i*)mask);
4883             xmm_dst = load_128_aligned ((__m128i*)dst);
4884
4885             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4886             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4887
4888             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4889                                 &xmm_mask_lo, &xmm_mask_hi,
4890                                 &xmm_mask_lo, &xmm_mask_hi);
4891
4892             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4893             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4894
4895             save_128_aligned (
4896                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4897
4898             mask += 16;
4899             dst += 16;
4900             w -= 16;
4901         }
4902
4903         while (w)
4904         {
4905             m = (uint32_t) *mask++;
4906             d = (uint32_t) *dst;
4907
4908             *dst++ = (uint8_t) pack_1x64_32 (
4909                 _mm_adds_pu16 (
4910                     pix_multiply_1x64 (
4911                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4912                     unpack_32_1x64 (d)));
4913
4914             w--;
4915         }
4916     }
4917
4918     _mm_empty ();
4919 }
4920
4921 /* -------------------------------------------------------------------------
4922  * composite_add_n_8
4923  */
4924
4925 static void
4926 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4927                         pixman_op_t              op,
4928                         pixman_image_t *         src_image,
4929                         pixman_image_t *         mask_image,
4930                         pixman_image_t *         dst_image,
4931                         int32_t                  src_x,
4932                         int32_t                  src_y,
4933                         int32_t                  mask_x,
4934                         int32_t                  mask_y,
4935                         int32_t                  dest_x,
4936                         int32_t                  dest_y,
4937                         int32_t                  width,
4938                         int32_t                  height)
4939 {
4940     uint8_t     *dst_line, *dst;
4941     int dst_stride;
4942     int32_t w;
4943     uint32_t src;
4944
4945     __m128i xmm_src;
4946
4947     PIXMAN_IMAGE_GET_LINE (
4948         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4949
4950     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4951
4952     src >>= 24;
4953
4954     if (src == 0x00)
4955         return;
4956
4957     if (src == 0xff)
4958     {
4959         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4960                      8, dest_x, dest_y, width, height, 0xff);
4961
4962         return;
4963     }
4964
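     /* Replicate the 8-bit alpha into all four bytes, then broadcast it
      * across the whole register.
      */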
4965     src = (src << 24) | (src << 16) | (src << 8) | src;
4966     xmm_src = _mm_set_epi32 (src, src, src, src);
4967
4968     while (height--)
4969     {
4970         dst = dst_line;
4971         dst_line += dst_stride;
4972         w = width;
4973
4974         while (w && ((unsigned long)dst & 15))
4975         {
4976             *dst = (uint8_t)_mm_cvtsi64_si32 (
4977                 _mm_adds_pu8 (
4978                     _mm_movepi64_pi64 (xmm_src),
4979                     _mm_cvtsi32_si64 (*dst)));
4980
4981             w--;
4982             dst++;
4983         }
4984
4985         while (w >= 16)
4986         {
4987             save_128_aligned (
4988                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4989
4990             dst += 16;
4991             w -= 16;
4992         }
4993
4994         while (w)
4995         {
4996             *dst = (uint8_t)_mm_cvtsi64_si32 (
4997                 _mm_adds_pu8 (
4998                     _mm_movepi64_pi64 (xmm_src),
4999                     _mm_cvtsi32_si64 (*dst)));
5000
5001             w--;
5002             dst++;
5003         }
5004     }
5005
5006     _mm_empty ();
5007 }
5008
5009 /* ----------------------------------------------------------------------
5010  * composite_add_8_8
5011  */
5012
5013 static void
5014 sse2_composite_add_8_8 (pixman_implementation_t *imp,
5015                         pixman_op_t              op,
5016                         pixman_image_t *         src_image,
5017                         pixman_image_t *         mask_image,
5018                         pixman_image_t *         dst_image,
5019                         int32_t                  src_x,
5020                         int32_t                  src_y,
5021                         int32_t                  mask_x,
5022                         int32_t                  mask_y,
5023                         int32_t                  dest_x,
5024                         int32_t                  dest_y,
5025                         int32_t                  width,
5026                         int32_t                  height)
5027 {
5028     uint8_t     *dst_line, *dst;
5029     uint8_t     *src_line, *src;
5030     int dst_stride, src_stride;
5031     int32_t w;
5032     uint16_t t;
5033
5034     PIXMAN_IMAGE_GET_LINE (
5035         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5036     PIXMAN_IMAGE_GET_LINE (
5037         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5038
5039     while (height--)
5040     {
5041         dst = dst_line;
5042         src = src_line;
5043
5044         dst_line += dst_stride;
5045         src_line += src_stride;
5046         w = width;
5047
5048         /* Small head */
5049         while (w && (unsigned long)dst & 3)
5050         {
5051             t = (*dst) + (*src++);
5052             *dst++ = t | (0 - (t >> 8));
5053             w--;
5054         }
5055
5056         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5057
5058         /* Small tail */
5059         dst += w & 0xfffc;
5060         src += w & 0xfffc;
5061
5062         w &= 3;
5063
5064         while (w)
5065         {
5066             t = (*dst) + (*src++);
5067             *dst++ = t | (0 - (t >> 8));
5068             w--;
5069         }
5070     }
5071
5072     _mm_empty ();
5073 }
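
/* The head and tail loops above use a branch-free saturation: with
 * t = 0x12c (300), t >> 8 is 1, so 0 - (t >> 8) is all ones and the OR
 * clamps the byte to 0xff; with t <= 0xff the mask is 0 and t passes
 * through unchanged.  The aligned middle delegates to
 * core_combine_add_u_sse2, which also saturates per byte, so treating
 * four a8 pixels as one 32-bit unit (hence w >> 2 units) is safe.
 * A hedged scalar sketch, helper name ours:
 */
static force_inline uint8_t
example_add_sat_branchless (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t)(a + b);

    return (uint8_t)(t | (0 - (t >> 8)));
}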
5074
5075 /* ---------------------------------------------------------------------
5076  * composite_add_8888_8888
5077  */
5078 static void
5079 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5080                               pixman_op_t              op,
5081                               pixman_image_t *         src_image,
5082                               pixman_image_t *         mask_image,
5083                               pixman_image_t *         dst_image,
5084                               int32_t                  src_x,
5085                               int32_t                  src_y,
5086                               int32_t                  mask_x,
5087                               int32_t                  mask_y,
5088                               int32_t                  dest_x,
5089                               int32_t                  dest_y,
5090                               int32_t                  width,
5091                               int32_t                  height)
5092 {
5093     uint32_t    *dst_line, *dst;
5094     uint32_t    *src_line, *src;
5095     int dst_stride, src_stride;
5096
5097     PIXMAN_IMAGE_GET_LINE (
5098         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5099     PIXMAN_IMAGE_GET_LINE (
5100         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5101
5102     while (height--)
5103     {
5104         dst = dst_line;
5105         dst_line += dst_stride;
5106         src = src_line;
5107         src_line += src_stride;
5108
5109         core_combine_add_u_sse2 (dst, src, NULL, width);
5110     }
5111
5112     _mm_empty ();
5113 }
5114
5115 /* ----------------------------------------------------------------------
5116  * sse2_composite_copy_area
5117  */
5118
5119 static pixman_bool_t
5120 pixman_blt_sse2 (uint32_t *src_bits,
5121                  uint32_t *dst_bits,
5122                  int       src_stride,
5123                  int       dst_stride,
5124                  int       src_bpp,
5125                  int       dst_bpp,
5126                  int       src_x,
5127                  int       src_y,
5128                  int       dst_x,
5129                  int       dst_y,
5130                  int       width,
5131                  int       height)
5132 {
5133     uint8_t *   src_bytes;
5134     uint8_t *   dst_bytes;
5135     int byte_width;
5136
5137     if (src_bpp != dst_bpp)
5138         return FALSE;
5139
5140     if (src_bpp == 16)
5141     {
5142         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5143         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5144         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5145         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5146         byte_width = 2 * width;
5147         src_stride *= 2;
5148         dst_stride *= 2;
5149     }
5150     else if (src_bpp == 32)
5151     {
5152         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5153         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5154         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5155         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5156         byte_width = 4 * width;
5157         src_stride *= 4;
5158         dst_stride *= 4;
5159     }
5160     else
5161     {
5162         return FALSE;
5163     }
5164
5165     while (height--)
5166     {
5167         int w;
5168         uint8_t *s = src_bytes;
5169         uint8_t *d = dst_bytes;
5170         src_bytes += src_stride;
5171         dst_bytes += dst_stride;
5172         w = byte_width;
5173
5174         while (w >= 2 && ((unsigned long)d & 3))
5175         {
5176             *(uint16_t *)d = *(uint16_t *)s;
5177             w -= 2;
5178             s += 2;
5179             d += 2;
5180         }
5181
5182         while (w >= 4 && ((unsigned long)d & 15))
5183         {
5184             *(uint32_t *)d = *(uint32_t *)s;
5185
5186             w -= 4;
5187             s += 4;
5188             d += 4;
5189         }
5190
5191         while (w >= 64)
5192         {
5193             __m128i xmm0, xmm1, xmm2, xmm3;
5194
5195             xmm0 = load_128_unaligned ((__m128i*)(s));
5196             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5197             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5198             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5199
5200             save_128_aligned ((__m128i*)(d),    xmm0);
5201             save_128_aligned ((__m128i*)(d + 16), xmm1);
5202             save_128_aligned ((__m128i*)(d + 32), xmm2);
5203             save_128_aligned ((__m128i*)(d + 48), xmm3);
5204
5205             s += 64;
5206             d += 64;
5207             w -= 64;
5208         }
5209
5210         while (w >= 16)
5211         {
5212             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
5213
5214             w -= 16;
5215             d += 16;
5216             s += 16;
5217         }
5218
5219         while (w >= 4)
5220         {
5221             *(uint32_t *)d = *(uint32_t *)s;
5222
5223             w -= 4;
5224             s += 4;
5225             d += 4;
5226         }
5227
5228         if (w >= 2)
5229         {
5230             *(uint16_t *)d = *(uint16_t *)s;
5231             w -= 2;
5232             s += 2;
5233             d += 2;
5234         }
5235     }
5236
5237     _mm_empty ();
5238
5239     return TRUE;
5240 }
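
/* Stride bookkeeping above, spelled out: strides arrive in uint32_t
 * units, so for 16 bpp they are first rescaled to uint16_t units
 * (* sizeof (uint32_t) / 2) to locate the first pixel, then to bytes
 * (* 2) for the copy loops.  E.g. a rowstride of 100 uint32_t words is
 * 200 uint16_t pixels and 400 bytes per row.  The loops then follow
 * the usual pattern: a scalar head up to 16-byte destination
 * alignment, a 64-byte unrolled core, and progressively smaller tails.
 */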
5241
5242 static void
5243 sse2_composite_copy_area (pixman_implementation_t *imp,
5244                           pixman_op_t              op,
5245                           pixman_image_t *         src_image,
5246                           pixman_image_t *         mask_image,
5247                           pixman_image_t *         dst_image,
5248                           int32_t                  src_x,
5249                           int32_t                  src_y,
5250                           int32_t                  mask_x,
5251                           int32_t                  mask_y,
5252                           int32_t                  dest_x,
5253                           int32_t                  dest_y,
5254                           int32_t                  width,
5255                           int32_t                  height)
5256 {
5257     pixman_blt_sse2 (src_image->bits.bits,
5258                      dst_image->bits.bits,
5259                      src_image->bits.rowstride,
5260                      dst_image->bits.rowstride,
5261                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5262                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5263                      src_x, src_y, dest_x, dest_y, width, height);
5264 }
5265
5266 static void
5267 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5268                                  pixman_op_t              op,
5269                                  pixman_image_t *         src_image,
5270                                  pixman_image_t *         mask_image,
5271                                  pixman_image_t *         dst_image,
5272                                  int32_t                  src_x,
5273                                  int32_t                  src_y,
5274                                  int32_t                  mask_x,
5275                                  int32_t                  mask_y,
5276                                  int32_t                  dest_x,
5277                                  int32_t                  dest_y,
5278                                  int32_t                  width,
5279                                  int32_t                  height)
5280 {
5281     uint32_t    *src, *src_line, s;
5282     uint32_t    *dst, *dst_line, d;
5283     uint8_t         *mask, *mask_line;
5284     uint32_t m;
5285     int src_stride, mask_stride, dst_stride;
5286     int32_t w;
5287     __m64 ms;
5288
5289     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5290     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5291     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5292
5293     PIXMAN_IMAGE_GET_LINE (
5294         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5295     PIXMAN_IMAGE_GET_LINE (
5296         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5297     PIXMAN_IMAGE_GET_LINE (
5298         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5299
5300     while (height--)
5301     {
5302         src = src_line;
5303         src_line += src_stride;
5304         dst = dst_line;
5305         dst_line += dst_stride;
5306         mask = mask_line;
5307         mask_line += mask_stride;
5308
5309         w = width;
5310
5311         while (w && (unsigned long)dst & 15)
5312         {
5313             s = 0xff000000 | *src++;
5314             m = (uint32_t) *mask++;
5315             d = *dst;
5316             ms = unpack_32_1x64 (s);
5317
5318             if (m != 0xff)
5319             {
5320                 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5321                 __m64 md = unpack_32_1x64 (d);
5322
5323                 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5324             }
5325
5326             *dst++ = pack_1x64_32 (ms);
5327             w--;
5328         }
5329
5330         while (w >= 4)
5331         {
5332             m = *(uint32_t*) mask;
5333             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5334
5335             if (m == 0xffffffff)
5336             {
5337                 save_128_aligned ((__m128i*)dst, xmm_src);
5338             }
5339             else
5340             {
5341                 xmm_dst = load_128_aligned ((__m128i*)dst);
5342
5343                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5344
5345                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5346                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5347                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5348
5349                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5350
5351                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5352
5353                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5354             }
5355
5356             src += 4;
5357             dst += 4;
5358             mask += 4;
5359             w -= 4;
5360         }
5361
5362         while (w)
5363         {
5364             m = (uint32_t) *mask++;
5365
5366             if (m)
5367             {
5368                 s = 0xff000000 | *src;
5369
5370                 if (m == 0xff)
5371                 {
5372                     *dst = s;
5373                 }
5374                 else
5375                 {
5376                     __m64 ma, md, ms;
5377
5378                     d = *dst;
5379
5380                     ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5381                     md = unpack_32_1x64 (d);
5382                     ms = unpack_32_1x64 (s);
5383
5384                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5385                 }
5386
5387             }
5388
5389             src++;
5390             dst++;
5391             w--;
5392         }
5393     }
5394
5395     _mm_empty ();
5396 }
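
/* Scalar sketch of the in_over step used above.  For x888 sources the
 * alpha is forced to 0xff, so per channel the result collapses to a
 * plain lerp by the a8 mask: (s * m + d * (255 - m)) / 255, using the
 * exact rounding divide-by-255 below (the SIMD path additionally
 * saturates its final add).  Helper names are ours, for illustration
 * only.
 */
static force_inline uint8_t
example_div_255 (unsigned x)
{
    x += 0x80;
    return (uint8_t)((x + (x >> 8)) >> 8);
}

static force_inline uint8_t
example_lerp_channel (uint8_t s, uint8_t d, uint8_t m)
{
    return (uint8_t)(example_div_255 ((unsigned)s * m) +
                     example_div_255 ((unsigned)d * (255 - m)));
}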
5397
5398 static void
5399 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5400                                  pixman_op_t              op,
5401                                  pixman_image_t *         src_image,
5402                                  pixman_image_t *         mask_image,
5403                                  pixman_image_t *         dst_image,
5404                                  int32_t                  src_x,
5405                                  int32_t                  src_y,
5406                                  int32_t                  mask_x,
5407                                  int32_t                  mask_y,
5408                                  int32_t                  dest_x,
5409                                  int32_t                  dest_y,
5410                                  int32_t                  width,
5411                                  int32_t                  height)
5412 {
5413     uint32_t    *src, *src_line, s;
5414     uint32_t    *dst, *dst_line, d;
5415     uint8_t         *mask, *mask_line;
5416     uint32_t m;
5417     int src_stride, mask_stride, dst_stride;
5418     int32_t w;
5419
5420     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5421     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5422     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5423
5424     PIXMAN_IMAGE_GET_LINE (
5425         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5426     PIXMAN_IMAGE_GET_LINE (
5427         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5428     PIXMAN_IMAGE_GET_LINE (
5429         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5430
5431     while (height--)
5432     {
5433         src = src_line;
5434         src_line += src_stride;
5435         dst = dst_line;
5436         dst_line += dst_stride;
5437         mask = mask_line;
5438         mask_line += mask_stride;
5439
5440         w = width;
5441
5442         while (w && (unsigned long)dst & 15)
5443         {
5444             uint32_t sa;
5445
5446             s = *src++;
5447             m = (uint32_t) *mask++;
5448             d = *dst;
5449
5450             sa = s >> 24;
5451
5452             if (m)
5453             {
5454                 if (sa == 0xff && m == 0xff)
5455                 {
5456                     *dst = s;
5457                 }
5458                 else
5459                 {
5460                     __m64 ms, md, ma, msa;
5461
5462                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5463                     ms = unpack_32_1x64 (s);
5464                     md = unpack_32_1x64 (d);
5465
5466                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5467
5468                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5469                 }
5470             }
5471
5472             dst++;
5473             w--;
5474         }
5475
5476         while (w >= 4)
5477         {
5478             m = *(uint32_t *) mask;
5479
5480             if (m)
5481             {
5482                 xmm_src = load_128_unaligned ((__m128i*)src);
5483
5484                 if (m == 0xffffffff && is_opaque (xmm_src))
5485                 {
5486                     save_128_aligned ((__m128i *)dst, xmm_src);
5487                 }
5488                 else
5489                 {
5490                     xmm_dst = load_128_aligned ((__m128i *)dst);
5491
5492                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5493
5494                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5495                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5496                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5497
5498                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5499                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5500
5501                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5502                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5503
5504                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5505                 }
5506             }
5507
5508             src += 4;
5509             dst += 4;
5510             mask += 4;
5511             w -= 4;
5512         }
5513
5514         while (w)
5515         {
5516             uint32_t sa;
5517
5518             s = *src++;
5519             m = (uint32_t) *mask++;
5520             d = *dst;
5521
5522             sa = s >> 24;
5523
5524             if (m)
5525             {
5526                 if (sa == 0xff && m == 0xff)
5527                 {
5528                     *dst = s;
5529                 }
5530                 else
5531                 {
5532                     __m64 ms, md, ma, msa;
5533
5534                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5535                     ms = unpack_32_1x64 (s);
5536                     md = unpack_32_1x64 (d);
5537
5538                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5539
5540                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5541                 }
5542             }
5543
5544             dst++;
5545             w--;
5546         }
5547     }
5548
5549     _mm_empty ();
5550 }
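
/* Same structure as the x888 variant above, but with real source
 * alpha: the per-channel scalar form generalizes to
 * (s * m + d * (255 - sa * m / 255)) / 255, which is what
 * in_over_1x64 / in_over_2x128 compute with &msa in place of the
 * constant 0x00ff alpha.
 */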
5551
5552 static void
5553 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5554                                     pixman_op_t              op,
5555                                     pixman_image_t *         src_image,
5556                                     pixman_image_t *         mask_image,
5557                                     pixman_image_t *         dst_image,
5558                                     int32_t                  src_x,
5559                                     int32_t                  src_y,
5560                                     int32_t                  mask_x,
5561                                     int32_t                  mask_y,
5562                                     int32_t                  dest_x,
5563                                     int32_t                  dest_y,
5564                                     int32_t                  width,
5565                                     int32_t                  height)
5566 {
5567     uint32_t src;
5568     uint32_t    *dst_line, *dst;
5569     __m128i xmm_src;
5570     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5571     __m128i xmm_dsta_hi, xmm_dsta_lo;
5572     int dst_stride;
5573     int32_t w;
5574
5575     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5576
5577     if (src == 0)
5578         return;
5579
5580     PIXMAN_IMAGE_GET_LINE (
5581         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5582
5583     xmm_src = expand_pixel_32_1x128 (src);
5584
5585     while (height--)
5586     {
5587         dst = dst_line;
5588
5589         dst_line += dst_stride;
5590         w = width;
5591
5592         while (w && (unsigned long)dst & 15)
5593         {
5594             __m64 vd;
5595
5596             vd = unpack_32_1x64 (*dst);
5597
5598             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5599                                             _mm_movepi64_pi64 (xmm_src)));
5600             w--;
5601             dst++;
5602         }
5603
5604         while (w >= 4)
5605         {
5606             __m128i tmp_lo, tmp_hi;
5607
5608             xmm_dst = load_128_aligned ((__m128i*)dst);
5609
5610             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5611             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5612
5613             tmp_lo = xmm_src;
5614             tmp_hi = xmm_src;
5615
5616             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5617                         &xmm_dsta_lo, &xmm_dsta_hi,
5618                         &tmp_lo, &tmp_hi);
5619
5620             save_128_aligned (
5621                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5622
5623             w -= 4;
5624             dst += 4;
5625         }
5626
5627         while (w)
5628         {
5629             __m64 vd;
5630
5631             vd = unpack_32_1x64 (*dst);
5632
5633             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5634                                             _mm_movepi64_pi64 (xmm_src)));
5635             w--;
5636             dst++;
5637         }
5638
5639     }
5640
5641     _mm_empty ();
5642 }
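
/* OVER_REVERSE keeps the destination on top: per channel the solid
 * source only shows through the destination's remaining coverage,
 * result = d + s * (255 - da) / 255.  A self-contained scalar sketch
 * with the same rounding (name ours; the SIMD path saturates the add
 * the same way):
 */
static force_inline uint8_t
example_over_reverse_channel (uint8_t s, uint8_t d, uint8_t da)
{
    unsigned t = (unsigned)s * (255 - da) + 0x80;

    t = d + ((t + (t >> 8)) >> 8);
    return (uint8_t)(t > 0xff ? 0xff : t);
}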
5643
5644 static void
5645 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5646                                     pixman_op_t              op,
5647                                     pixman_image_t *         src_image,
5648                                     pixman_image_t *         mask_image,
5649                                     pixman_image_t *         dst_image,
5650                                     int32_t                  src_x,
5651                                     int32_t                  src_y,
5652                                     int32_t                  mask_x,
5653                                     int32_t                  mask_y,
5654                                     int32_t                  dest_x,
5655                                     int32_t                  dest_y,
5656                                     int32_t                  width,
5657                                     int32_t                  height)
5658 {
5659     uint32_t    *src, *src_line, s;
5660     uint32_t    *dst, *dst_line, d;
5661     uint32_t    *mask, *mask_line;
5662     uint32_t    m;
5663     int src_stride, mask_stride, dst_stride;
5664     int32_t w;
5665
5666     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5667     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5668     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5669
5670     PIXMAN_IMAGE_GET_LINE (
5671         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5672     PIXMAN_IMAGE_GET_LINE (
5673         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5674     PIXMAN_IMAGE_GET_LINE (
5675         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5676
5677     while (height--)
5678     {
5679         src = src_line;
5680         src_line += src_stride;
5681         dst = dst_line;
5682         dst_line += dst_stride;
5683         mask = mask_line;
5684         mask_line += mask_stride;
5685
5686         w = width;
5687
5688         while (w && (unsigned long)dst & 15)
5689         {
5690             uint32_t sa;
5691
5692             s = *src++;
5693             m = (*mask++) >> 24;
5694             d = *dst;
5695
5696             sa = s >> 24;
5697
5698             if (m)
5699             {
5700                 if (sa == 0xff && m == 0xff)
5701                 {
5702                     *dst = s;
5703                 }
5704                 else
5705                 {
5706                     __m64 ms, md, ma, msa;
5707
5708                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5709                     ms = unpack_32_1x64 (s);
5710                     md = unpack_32_1x64 (d);
5711
5712                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5713
5714                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5715                 }
5716             }
5717
5718             dst++;
5719             w--;
5720         }
5721
5722         while (w >= 4)
5723         {
5724             xmm_mask = load_128_unaligned ((__m128i*)mask);
5725
5726             if (!is_transparent (xmm_mask))
5727             {
5728                 xmm_src = load_128_unaligned ((__m128i*)src);
5729
5730                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5731                 {
5732                     save_128_aligned ((__m128i *)dst, xmm_src);
5733                 }
5734                 else
5735                 {
5736                     xmm_dst = load_128_aligned ((__m128i *)dst);
5737
5738                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5739                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5740                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5741
5742                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5743                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5744
5745                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5746                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5747
5748                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5749                 }
5750             }
5751
5752             src += 4;
5753             dst += 4;
5754             mask += 4;
5755             w -= 4;
5756         }
5757
5758         while (w)
5759         {
5760             uint32_t sa;
5761
5762             s = *src++;
5763             m = (*mask++) >> 24;
5764             d = *dst;
5765
5766             sa = s >> 24;
5767
5768             if (m)
5769             {
5770                 if (sa == 0xff && m == 0xff)
5771                 {
5772                     *dst = s;
5773                 }
5774                 else
5775                 {
5776                     __m64 ms, md, ma, msa;
5777
5778                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5779                     ms = unpack_32_1x64 (s);
5780                     md = unpack_32_1x64 (d);
5781
5782                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5783
5784                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5785                 }
5786             }
5787
5788             dst++;
5789             w--;
5790         }
5791     }
5792
5793     _mm_empty ();
5794 }
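
/* Note the a8r8g8b8 mask is used as a unit mask here: only its alpha
 * byte matters (m >> 24 in the scalar loops, the alpha-based
 * is_transparent / is_opaque tests and expand_alpha_2x128 on the
 * vector side), matching the non-component-alpha OVER entry for 8888
 * masks in the fast path table below.
 */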
5795
5796 /* A variant of 'core_combine_over_u_sse2' with minor tweaks */
5797 static force_inline void
5798 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5799                                              const uint32_t* ps,
5800                                              int32_t         w,
5801                                              pixman_fixed_t  vx,
5802                                              pixman_fixed_t  unit_x,
5803                                              pixman_fixed_t  max_vx)
5804 {
5805     uint32_t s, d;
5806     const uint32_t* pm = NULL;
5807
5808     __m128i xmm_dst_lo, xmm_dst_hi;
5809     __m128i xmm_src_lo, xmm_src_hi;
5810     __m128i xmm_alpha_lo, xmm_alpha_hi;
5811
5812     /* Align dst on a 16-byte boundary */
5813     while (w && ((unsigned long)pd & 15))
5814     {
5815         d = *pd;
5816         s = combine1 (ps + (vx >> 16), pm);
5817         vx += unit_x;
5818
5819         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5820         if (pm)
5821             pm++;
5822         w--;
5823     }
5824
5825     while (w >= 4)
5826     {
5827         __m128i tmp;
5828         uint32_t tmp1, tmp2, tmp3, tmp4;
5829
5830         tmp1 = ps[vx >> 16];
5831         vx += unit_x;
5832         tmp2 = ps[vx >> 16];
5833         vx += unit_x;
5834         tmp3 = ps[vx >> 16];
5835         vx += unit_x;
5836         tmp4 = ps[vx >> 16];
5837         vx += unit_x;
5838
5839         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5840
5841         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5842
5843         if (is_opaque (xmm_src_hi))
5844         {
5845             save_128_aligned ((__m128i*)pd, xmm_src_hi);
5846         }
5847         else if (!is_zero (xmm_src_hi))
5848         {
5849             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5850
5851             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5852             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5853
5854             expand_alpha_2x128 (
5855                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5856
5857             over_2x128 (&xmm_src_lo, &xmm_src_hi,
5858                         &xmm_alpha_lo, &xmm_alpha_hi,
5859                         &xmm_dst_lo, &xmm_dst_hi);
5860
5861             /* rebuild the 4 pixel data and save */
5862             save_128_aligned ((__m128i*)pd,
5863                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5864         }
5865
5866         w -= 4;
5867         pd += 4;
5868         if (pm)
5869             pm += 4;
5870     }
5871
5872     while (w)
5873     {
5874         d = *pd;
5875         s = combine1 (ps + (vx >> 16), pm);
5876         vx += unit_x;
5877
5878         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5879         if (pm)
5880             pm++;
5881
5882         w--;
5883     }
5884     _mm_empty ();
5885 }
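
/* The source walk above is 16.16 fixed point: vx advances by unit_x
 * per destination pixel and vx >> 16 selects the nearest source
 * column.  A minimal scalar sketch of the same stepping (name ours):
 */
static force_inline uint32_t
example_fetch_nearest (const uint32_t *ps,
                       pixman_fixed_t *vx,
                       pixman_fixed_t  unit_x)
{
    uint32_t s = ps[*vx >> 16];

    *vx += unit_x;
    return s;
}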
5886
5887 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5888                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5889                        uint32_t, uint32_t, COVER)
5890 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5891                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5892                        uint32_t, uint32_t, NONE)
5893 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5894                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5895                        uint32_t, uint32_t, PAD)
5896
5897 static const pixman_fast_path_t sse2_fast_paths[] =
5898 {
5899     /* PIXMAN_OP_OVER */
5900     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5901     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5902     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5903     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5904     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5905     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5906     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5907     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5908     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5909     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5910     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5911     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5912     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5913     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5914     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5915     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5916     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5917     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5918     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5919     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5920     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5921     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5922     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5923     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5924     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5925     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5926     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5927     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5928     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5929     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5930     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5931     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5932     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5933     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5934     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5935     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5936     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5937     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5938     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5939     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5940     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5941     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5942     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5943     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5944     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5945     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5946
5947     /* PIXMAN_OP_OVER_REVERSE */
5948     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5949     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5950
5951     /* PIXMAN_OP_ADD */
5952     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5953     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5954     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5955     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5956     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5957     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5958
5959     /* PIXMAN_OP_SRC */
5960     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5961     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5962     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5963     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5964     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5965     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5966     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5967     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5968     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5969     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5970     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5971     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5972     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5973     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5974
5975     /* PIXMAN_OP_IN */
5976     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5977     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5978     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5979
5980     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5981     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5982     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5983     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5984     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5985     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5986     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5987     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5988     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5989     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5990     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5991     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5992
5993     { PIXMAN_OP_NONE },
5994 };
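
/* The table above is order-sensitive: lookup walks it top to bottom
 * and the first entry whose operator and formats (and flags) match
 * wins, so more specific entries must precede more general ones; the
 * { PIXMAN_OP_NONE } sentinel terminates the scan.  A hedged sketch of
 * that first-match walk (flag checks and delegate chaining omitted;
 * the function name is ours):
 */
static force_inline pixman_composite_func_t
example_lookup_fast_path (const pixman_fast_path_t *table,
                          pixman_op_t               op,
                          pixman_format_code_t      src,
                          pixman_format_code_t      mask,
                          pixman_format_code_t      dest)
{
    const pixman_fast_path_t *p;

    for (p = table; p->op != PIXMAN_OP_NONE; p++)
    {
        if (p->op == op            &&
            p->src_format == src   &&
            p->mask_format == mask &&
            p->dest_format == dest)
        {
            return p->func;
        }
    }

    return NULL;
}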
5995
5996 static pixman_bool_t
5997 sse2_blt (pixman_implementation_t *imp,
5998           uint32_t *               src_bits,
5999           uint32_t *               dst_bits,
6000           int                      src_stride,
6001           int                      dst_stride,
6002           int                      src_bpp,
6003           int                      dst_bpp,
6004           int                      src_x,
6005           int                      src_y,
6006           int                      dst_x,
6007           int                      dst_y,
6008           int                      width,
6009           int                      height)
6010 {
6011     if (!pixman_blt_sse2 (
6012             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6013             src_x, src_y, dst_x, dst_y, width, height))
6014
6015     {
6016         return _pixman_implementation_blt (
6017             imp->delegate,
6018             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6019             src_x, src_y, dst_x, dst_y, width, height);
6020     }
6021
6022     return TRUE;
6023 }
6024
6025 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6026 __attribute__((__force_align_arg_pointer__))
6027 #endif
6028 static pixman_bool_t
6029 sse2_fill (pixman_implementation_t *imp,
6030            uint32_t *               bits,
6031            int                      stride,
6032            int                      bpp,
6033            int                      x,
6034            int                      y,
6035            int                      width,
6036            int                      height,
6037            uint32_t xor)
6038 {
6039     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
6040     {
6041         return _pixman_implementation_fill (
6042             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
6043     }
6044
6045     return TRUE;
6046 }
6047
6048 static uint32_t *
6049 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6050 {
6051     int w = iter->width;
6052     __m128i ff000000 = mask_ff000000;
6053     uint32_t *dst = iter->buffer;
6054     uint32_t *src = (uint32_t *)iter->bits;
6055
6056     iter->bits += iter->stride;
6057
6058     while (w && ((unsigned long)dst) & 0x0f)
6059     {
6060         *dst++ = (*src++) | 0xff000000;
6061         w--;
6062     }
6063
6064     while (w >= 4)
6065     {
6066         save_128_aligned (
6067             (__m128i *)dst, _mm_or_si128 (
6068                 load_128_unaligned ((__m128i *)src), ff000000));
6069
6070         dst += 4;
6071         src += 4;
6072         w -= 4;
6073     }
6074
6075     while (w)
6076     {
6077         *dst++ = (*src++) | 0xff000000;
6078         w--;
6079     }
6080
6081     return iter->buffer;
6082 }
6083
6084 static uint32_t *
6085 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6086 {
6087     int w = iter->width;
6088     uint32_t *dst = iter->buffer;
6089     uint16_t *src = (uint16_t *)iter->bits;
6090     __m128i ff000000 = mask_ff000000;
6091
6092     iter->bits += iter->stride;
6093
6094     while (w && ((unsigned long)dst) & 0x0f)
6095     {
6096         uint16_t s = *src++;
6097
6098         *dst++ = CONVERT_0565_TO_8888 (s);
6099         w--;
6100     }
6101
6102     while (w >= 8)
6103     {
6104         __m128i lo, hi, s;
6105
6106         s = _mm_loadu_si128 ((__m128i *)src);
6107
6108         lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6109         hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6110
6111         save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6112         save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6113
6114         dst += 8;
6115         src += 8;
6116         w -= 8;
6117     }
6118
6119     while (w)
6120     {
6121         uint16_t s = *src++;
6122
6123         *dst++ = CONVERT_0565_TO_8888 (s);
6124         w--;
6125     }
6126
6127     return iter->buffer;
6128 }
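
/* Scalar reference for the 0565 expansion done by unpack_565_to_8888
 * above: each field is widened by replicating its top bits into the
 * low bits, so 0x1f expands to 0xff and 0x00 stays 0x00; the vector
 * path then ORs in the opaque alpha.  Illustrative only, name ours:
 */
static force_inline uint32_t
example_expand_565 (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}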
6129
6130 static uint32_t *
6131 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6132 {
6133     int w = iter->width;
6134     uint32_t *dst = iter->buffer;
6135     uint8_t *src = iter->bits;
6136     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6137
6138     iter->bits += iter->stride;
6139
6140     while (w && (((unsigned long)dst) & 15))
6141     {
6142         *dst++ = *(src++) << 24;
6143         w--;
6144     }
6145
6146     while (w >= 16)
6147     {
6148         xmm0 = _mm_loadu_si128 ((__m128i *)src);
6149
6150         xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128 (), xmm0);
6151         xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128 (), xmm0);
6152         xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm1);
6153         xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm1);
6154         xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm2);
6155         xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm2);
6156
6157         _mm_store_si128 ((__m128i *)(dst +  0), xmm3);
6158         _mm_store_si128 ((__m128i *)(dst +  4), xmm4);
6159         _mm_store_si128 ((__m128i *)(dst +  8), xmm5);
6160         _mm_store_si128 ((__m128i *)(dst + 12), xmm6);
6161
6162         dst += 16;
6163         src += 16;
6164         w -= 16;
6165     }
6166
6167     while (w)
6168     {
6169         *dst++ = *(src++) << 24;
6170         w--;
6171     }
6172
6173     return iter->buffer;
6174 }
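
/* Why the zeros go in the *first* unpack operand above: interleaving a
 * zero byte below each source byte moves it one byte up per step, so
 * after one 8-bit and one 16-bit unpack every alpha byte lands in bits
 * 24-31 of its dword, the vector equivalent of the scalar '<< 24'.
 * E.g. 0xa7 becomes 0xa700 after the first unpack and 0xa7000000 after
 * the second.
 */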
6175
6176 typedef struct
6177 {
6178     pixman_format_code_t        format;
6179     pixman_iter_get_scanline_t  get_scanline;
6180 } fetcher_info_t;
6181
6182 static const fetcher_info_t fetchers[] =
6183 {
6184     { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
6185     { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
6186     { PIXMAN_a8,                sse2_fetch_a8 },
6187     { PIXMAN_null }
6188 };
6189
6190 static void
6191 sse2_src_iter_init (pixman_implementation_t *imp,
6192                     pixman_iter_t *iter,
6193                     pixman_image_t *image,
6194                     int x, int y, int width, int height,
6195                     uint8_t *buffer, iter_flags_t flags)
6196 {
6197 #define FLAGS                                                           \
6198     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
6199
6200     if ((flags & ITER_NARROW)                           &&
6201         (image->common.flags & FLAGS) == FLAGS          &&
6202         x >= 0 && y >= 0                                &&
6203         x + width <= image->bits.width                  &&
6204         y + height <= image->bits.height)
6205     {
6206         const fetcher_info_t *f;
6207
6208         for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
6209         {
6210             if (image->common.extended_format_code == f->format)
6211             {
6212                 uint8_t *b = (uint8_t *)image->bits.bits;
6213                 int s = image->bits.rowstride * 4;
6214
6215                 iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
6216                 iter->stride = s;
6217                 iter->width = width;
6218                 iter->buffer = (uint32_t *)buffer;
6219
6220                 iter->get_scanline = f->get_scanline;
6221                 return;
6222             }
6223         }
6224     }
6225
6226     _pixman_implementation_src_iter_init (
6227         imp->delegate, iter, image, x, y, width, height, buffer, flags);
6228 }
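
/* Pointer math in the iterator setup, spelled out: the rowstride is in
 * uint32_t words, so the byte stride s is rowstride * 4 and the first
 * scanline starts at bits + s * y + x * bpp / 8.  For a PIXMAN_r5g6b5
 * image at x = 10 that is a 20-byte offset into the row.
 */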
6229
6230 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6231 __attribute__((__force_align_arg_pointer__))
6232 #endif
6233 pixman_implementation_t *
6234 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6235 {
6236     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6237
6238     /* SSE2 constants */
6239     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6240     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6241     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6242     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6243     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6244     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6245     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6246     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6247     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
6248     mask_0080 = create_mask_16_128 (0x0080);
6249     mask_00ff = create_mask_16_128 (0x00ff);
6250     mask_0101 = create_mask_16_128 (0x0101);
6251     mask_ffff = create_mask_16_128 (0xffff);
6252     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6253     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6254
6255     /* MMX constants */
6256     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
6257     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
6258
6259     mask_x0080 = create_mask_16_64 (0x0080);
6260     mask_x00ff = create_mask_16_64 (0x00ff);
6261     mask_x0101 = create_mask_16_64 (0x0101);
6262     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
6263
6264     _mm_empty ();
6265
6266     /* Set up function pointers */
6267
6268     /* SSE code patch for fbcompose.c */
6269     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6270     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6271     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6272     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6273     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6274     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6275     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6276     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6277     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6278     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6279
6280     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6281
6282     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6283     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6284     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6285     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6286     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6287     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6288     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6289     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6290     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6291     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6292     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6293
6294     imp->blt = sse2_blt;
6295     imp->fill = sse2_fill;
6296
6297     imp->src_iter_init = sse2_src_iter_init;
6298
6299     return imp;
6300 }
6301
6302 #endif /* USE_SSE2 */