1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <mmintrin.h>
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38 #include "pixman-fast-path.h"
39
40 #if defined(_MSC_VER) && defined(_M_AMD64)
41 /* The 64-bit Windows compiler does not support MMX intrinsics, so
42  * the pixman-x64-mmx-emulation.h file contains SSE2-based
43  * implementations of those MMX intrinsics that
44  * are used in the SSE2 implementation.
45  */
46 #   include "pixman-x64-mmx-emulation.h"
47 #endif
48
49 #ifdef USE_SSE2
50
51 /* --------------------------------------------------------------------
52  * Locals
53  */
54
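/* These masks are declared without initializers; they are expected to
 * be filled in once by the SSE2 implementation's setup code before any
 * of the routines below run.
 */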
55 static __m64 mask_x0080;
56 static __m64 mask_x00ff;
57 static __m64 mask_x0101;
58 static __m64 mask_x_alpha;
59
60 static __m64 mask_x565_rgb;
61 static __m64 mask_x565_unpack;
62
63 static __m128i mask_0080;
64 static __m128i mask_00ff;
65 static __m128i mask_0101;
66 static __m128i mask_ffff;
67 static __m128i mask_ff000000;
68 static __m128i mask_alpha;
69
70 static __m128i mask_565_r;
71 static __m128i mask_565_g1, mask_565_g2;
72 static __m128i mask_565_b;
73 static __m128i mask_red;
74 static __m128i mask_green;
75 static __m128i mask_blue;
76
77 static __m128i mask_565_fix_rb;
78 static __m128i mask_565_fix_g;
79
80 /* ----------------------------------------------------------------------
81  * SSE2 Inlines
82  */
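/* Unpack one a8r8g8b8 pixel into four 16-bit channels in the low half
 * of an XMM register: 0xAARRGGBB becomes 0x00AA00RR00GG00BB, so the
 * alpha ends up in word 3 (which is what expand_alpha_1x128 relies on).
 */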
83 static force_inline __m128i
84 unpack_32_1x128 (uint32_t data)
85 {
86     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
87 }
88
89 static force_inline void
90 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
91 {
92     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
93     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
94 }
95
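/* Convert four 565 pixels (each zero-extended into a 32-bit lane) to
 * 8888: shift each field to the top of its byte, then replicate the
 * field's high bits into the low bits so that e.g. 0x1f expands to
 * 0xff rather than 0xf8.
 */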
96 static force_inline __m128i
97 unpack_565_to_8888 (__m128i lo)
98 {
99     __m128i r, g, b, rb, t;
100
101     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
102     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
103     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
104
105     rb = _mm_or_si128 (r, b);
106     t  = _mm_and_si128 (rb, mask_565_fix_rb);
107     t  = _mm_srli_epi32 (t, 5);
108     rb = _mm_or_si128 (rb, t);
109
110     t  = _mm_and_si128 (g, mask_565_fix_g);
111     t  = _mm_srli_epi32 (t, 6);
112     g  = _mm_or_si128 (g, t);
113
114     return _mm_or_si128 (rb, g);
115 }
116
117 static force_inline void
118 unpack_565_128_4x128 (__m128i  data,
119                       __m128i* data0,
120                       __m128i* data1,
121                       __m128i* data2,
122                       __m128i* data3)
123 {
124     __m128i lo, hi;
125
126     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
127     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
128
129     lo = unpack_565_to_8888 (lo);
130     hi = unpack_565_to_8888 (hi);
131
132     unpack_128_2x128 (lo, data0, data1);
133     unpack_128_2x128 (hi, data2, data3);
134 }
135
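/* Truncate one x8r8g8b8 pixel to r5g6b5 by keeping the high bits of
 * each channel; for example, 0x00ff8040 packs to 0xfc08.
 */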
136 static force_inline uint16_t
137 pack_565_32_16 (uint32_t pixel)
138 {
139     return (uint16_t) (((pixel >> 8) & 0xf800) |
140                        ((pixel >> 5) & 0x07e0) |
141                        ((pixel >> 3) & 0x001f));
142 }
143
144 static force_inline __m128i
145 pack_2x128_128 (__m128i lo, __m128i hi)
146 {
147     return _mm_packus_epi16 (lo, hi);
148 }
149
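/* Pack four unpacked pixels back to 8888 and then split each pixel's
 * 565 value across the two 16-bit halves of its 32-bit lane (high and
 * low byte kept separately), so that the final _mm_packus_epi16 in
 * pack_565_4x128_128 assembles eight r5g6b5 pixels in one register.
 */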
150 static force_inline __m128i
151 pack_565_2x128_128 (__m128i lo, __m128i hi)
152 {
153     __m128i data;
154     __m128i r, g1, g2, b;
155
156     data = pack_2x128_128 (lo, hi);
157
158     r  = _mm_and_si128 (data, mask_565_r);
159     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
160     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
161     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
162
163     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
164 }
165
166 static force_inline __m128i
167 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
168 {
169     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
170                              pack_565_2x128_128 (*xmm2, *xmm3));
171 }
172
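/* In a8r8g8b8 the alpha bytes sit at offsets 3, 7, 11 and 15, so the
 * 0x8888 movemask pattern below tests only the alpha byte of each of
 * the four pixels: is_opaque is true when all four alphas are 0xff,
 * is_transparent when all four are 0x00.
 */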
173 static force_inline int
174 is_opaque (__m128i x)
175 {
176     __m128i ffs = _mm_cmpeq_epi8 (x, x);
177
178     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
179 }
180
181 static force_inline int
182 is_zero (__m128i x)
183 {
184     return _mm_movemask_epi8 (
185         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
186 }
187
188 static force_inline int
189 is_transparent (__m128i x)
190 {
191     return (_mm_movemask_epi8 (
192                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
193 }
194
195 static force_inline __m128i
196 expand_pixel_32_1x128 (uint32_t data)
197 {
198     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
199 }
200
201 static force_inline __m128i
202 expand_alpha_1x128 (__m128i data)
203 {
204     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
205                                                      _MM_SHUFFLE (3, 3, 3, 3)),
206                                 _MM_SHUFFLE (3, 3, 3, 3));
207 }
208
209 static force_inline void
210 expand_alpha_2x128 (__m128i  data_lo,
211                     __m128i  data_hi,
212                     __m128i* alpha_lo,
213                     __m128i* alpha_hi)
214 {
215     __m128i lo, hi;
216
217     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
218     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
219
220     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
221     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
222 }
223
224 static force_inline void
225 expand_alpha_rev_2x128 (__m128i  data_lo,
226                         __m128i  data_hi,
227                         __m128i* alpha_lo,
228                         __m128i* alpha_hi)
229 {
230     __m128i lo, hi;
231
232     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
233     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
234     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
235     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
236 }
237
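/* Per-channel multiply of eight 16-bit channels with rounded division
 * by 255, using the identity (a * b + 0x80) * 0x0101 >> 16, which is
 * exact for a, b in [0, 255].  The adds/mulhi pair below computes this,
 * with mask_0080 and mask_0101 holding (as their names suggest) 0x0080
 * and 0x0101 in every 16-bit lane.
 */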
238 static force_inline void
239 pix_multiply_2x128 (__m128i* data_lo,
240                     __m128i* data_hi,
241                     __m128i* alpha_lo,
242                     __m128i* alpha_hi,
243                     __m128i* ret_lo,
244                     __m128i* ret_hi)
245 {
246     __m128i lo, hi;
247
248     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
249     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
250     lo = _mm_adds_epu16 (lo, mask_0080);
251     hi = _mm_adds_epu16 (hi, mask_0080);
252     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
253     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
254 }
255
256 static force_inline void
257 pix_add_multiply_2x128 (__m128i* src_lo,
258                         __m128i* src_hi,
259                         __m128i* alpha_dst_lo,
260                         __m128i* alpha_dst_hi,
261                         __m128i* dst_lo,
262                         __m128i* dst_hi,
263                         __m128i* alpha_src_lo,
264                         __m128i* alpha_src_hi,
265                         __m128i* ret_lo,
266                         __m128i* ret_hi)
267 {
268     __m128i t1_lo, t1_hi;
269     __m128i t2_lo, t2_hi;
270
271     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
272     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
273
274     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
275     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
276 }
277
278 static force_inline void
279 negate_2x128 (__m128i  data_lo,
280               __m128i  data_hi,
281               __m128i* neg_lo,
282               __m128i* neg_hi)
283 {
284     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
285     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
286 }
287
288 static force_inline void
289 invert_colors_2x128 (__m128i  data_lo,
290                      __m128i  data_hi,
291                      __m128i* inv_lo,
292                      __m128i* inv_hi)
293 {
294     __m128i lo, hi;
295
296     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
297     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
298     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
299     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
300 }
301
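/* Porter-Duff OVER on two registers of unpacked pixels:
 * dst = src + (1 - alpha) * dst, computed per channel with a
 * saturating add.
 */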
302 static force_inline void
303 over_2x128 (__m128i* src_lo,
304             __m128i* src_hi,
305             __m128i* alpha_lo,
306             __m128i* alpha_hi,
307             __m128i* dst_lo,
308             __m128i* dst_hi)
309 {
310     __m128i t1, t2;
311
312     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
313
314     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
315
316     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
317     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
318 }
319
320 static force_inline void
321 over_rev_non_pre_2x128 (__m128i  src_lo,
322                         __m128i  src_hi,
323                         __m128i* dst_lo,
324                         __m128i* dst_hi)
325 {
326     __m128i lo, hi;
327     __m128i alpha_lo, alpha_hi;
328
329     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
330
331     lo = _mm_or_si128 (alpha_lo, mask_alpha);
332     hi = _mm_or_si128 (alpha_hi, mask_alpha);
333
334     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
335
336     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
337
338     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
339 }
340
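/* (src IN mask) OVER dst: both the source and its alpha are first
 * multiplied by the (per-channel) mask, then composited over dst.
 */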
341 static force_inline void
342 in_over_2x128 (__m128i* src_lo,
343                __m128i* src_hi,
344                __m128i* alpha_lo,
345                __m128i* alpha_hi,
346                __m128i* mask_lo,
347                __m128i* mask_hi,
348                __m128i* dst_lo,
349                __m128i* dst_hi)
350 {
351     __m128i s_lo, s_hi;
352     __m128i a_lo, a_hi;
353
354     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
355     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
356
357     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
358 }
359
360 /* load 4 pixels from a 16-byte aligned address */
361 static force_inline __m128i
362 load_128_aligned (__m128i* src)
363 {
364     return _mm_load_si128 (src);
365 }
366
367 /* load 4 pixels from an unaligned address */
368 static force_inline __m128i
369 load_128_unaligned (const __m128i* src)
370 {
371     return _mm_loadu_si128 (src);
372 }
373
374 /* save 4 pixels to a 16-byte aligned address using a non-temporal
375  * (write-combining) store
376  */
377 static force_inline void
378 save_128_write_combining (__m128i* dst,
379                           __m128i  data)
380 {
381     _mm_stream_si128 (dst, data);
382 }
383
384 /* save 4 pixels to a 16-byte aligned address */
385 static force_inline void
386 save_128_aligned (__m128i* dst,
387                   __m128i  data)
388 {
389     _mm_store_si128 (dst, data);
390 }
391
392 /* save 4 pixels to an unaligned address */
393 static force_inline void
394 save_128_unaligned (__m128i* dst,
395                     __m128i  data)
396 {
397     _mm_storeu_si128 (dst, data);
398 }
399
400 /* ------------------------------------------------------------------
401  * MMX inlines
402  */
403
404 static force_inline __m64
405 load_32_1x64 (uint32_t data)
406 {
407     return _mm_cvtsi32_si64 (data);
408 }
409
410 static force_inline __m64
411 unpack_32_1x64 (uint32_t data)
412 {
413     return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
414 }
415
416 static force_inline __m64
417 expand_alpha_1x64 (__m64 data)
418 {
419     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
420 }
421
422 static force_inline __m64
423 expand_alpha_rev_1x64 (__m64 data)
424 {
425     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
426 }
427
428 static force_inline __m64
429 expand_pixel_8_1x64 (uint8_t data)
430 {
431     return _mm_shuffle_pi16 (
432         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
433 }
434
435 static force_inline __m64
436 pix_multiply_1x64 (__m64 data,
437                    __m64 alpha)
438 {
439     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
440                                           mask_x0080),
441                            mask_x0101);
442 }
443
444 static force_inline __m64
445 pix_add_multiply_1x64 (__m64* src,
446                        __m64* alpha_dst,
447                        __m64* dst,
448                        __m64* alpha_src)
449 {
450     __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
451     __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
452
453     return _mm_adds_pu8 (t1, t2);
454 }
455
456 static force_inline __m64
457 negate_1x64 (__m64 data)
458 {
459     return _mm_xor_si64 (data, mask_x00ff);
460 }
461
462 static force_inline __m64
463 invert_colors_1x64 (__m64 data)
464 {
465     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
466 }
467
468 static force_inline __m64
469 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
470 {
471     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
472 }
473
474 static force_inline __m64
475 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
476 {
477     return over_1x64 (pix_multiply_1x64 (*src, *mask),
478                       pix_multiply_1x64 (*alpha, *mask),
479                       *dst);
480 }
481
482 static force_inline __m64
483 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
484 {
485     __m64 alpha = expand_alpha_1x64 (src);
486
487     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
488                                          _mm_or_si64 (alpha, mask_x_alpha)),
489                       alpha,
490                       dst);
491 }
492
493 static force_inline uint32_t
494 pack_1x64_32 (__m64 data)
495 {
496     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
497 }
498
499 /* Expand a 565 pixel held in the low 16 bits of an MMX register into
500  *
501  *    00RR00GG00BB
502  *
503  * --- Expanding 565 in the low word ---
504  *
505  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
506  * m = m & (01f0003f001f);
507  * m = m * (008404100840);
508  * m = m >> 8;
509  *
510  * Note the trick here - the top word is shifted by another nibble to
511  * avoid it bumping into the middle word
512  */
513 static force_inline __m64
514 expand565_16_1x64 (uint16_t pixel)
515 {
516     __m64 p;
517     __m64 t1, t2;
518
519     p = _mm_cvtsi32_si64 ((uint32_t) pixel);
520
521     t1 = _mm_slli_si64 (p, 36 - 11);
522     t2 = _mm_slli_si64 (p, 16 - 5);
523
524     p = _mm_or_si64 (t1, p);
525     p = _mm_or_si64 (t2, p);
526     p = _mm_and_si64 (p, mask_x565_rgb);
527     p = _mm_mullo_pi16 (p, mask_x565_unpack);
528
529     return _mm_srli_pi16 (p, 8);
530 }
531
532 /* ----------------------------------------------------------------------------
533  * Compose Core transformations
534  */
535 static force_inline uint32_t
536 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
537 {
538     uint8_t a;
539     __m64 ms;
540
541     a = src >> 24;
542
543     if (a == 0xff)
544     {
545         return src;
546     }
547     else if (src)
548     {
549         ms = unpack_32_1x64 (src);
550         return pack_1x64_32 (
551             over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
552     }
553
554     return dst;
555 }
556
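/* Fetch one source pixel, multiplying it by the mask's alpha when a
 * mask is present (the *_u combiners treat the mask as a single alpha
 * value rather than per channel).
 */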
557 static force_inline uint32_t
558 combine1 (const uint32_t *ps, const uint32_t *pm)
559 {
560     uint32_t s = *ps;
561
562     if (pm)
563     {
564         __m64 ms, mm;
565
566         mm = unpack_32_1x64 (*pm);
567         mm = expand_alpha_1x64 (mm);
568
569         ms = unpack_32_1x64 (s);
570         ms = pix_multiply_1x64 (ms, mm);
571
572         s = pack_1x64_32 (ms);
573     }
574
575     return s;
576 }
577
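/* Fetch four source pixels, applying the mask's alpha as in combine1.
 * A fully transparent mask short-circuits to zero.  Both loads are
 * unaligned because only the destination is aligned by the callers.
 */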
578 static force_inline __m128i
579 combine4 (const __m128i *ps, const __m128i *pm)
580 {
581     __m128i xmm_src_lo, xmm_src_hi;
582     __m128i xmm_msk_lo, xmm_msk_hi;
583     __m128i s;
584
585     if (pm)
586     {
587         xmm_msk_lo = load_128_unaligned (pm);
588
589         if (is_transparent (xmm_msk_lo))
590             return _mm_setzero_si128 ();
591     }
592
593     s = load_128_unaligned (ps);
594
595     if (pm)
596     {
597         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
598         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
599
600         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
601
602         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
603                             &xmm_msk_lo, &xmm_msk_hi,
604                             &xmm_src_lo, &xmm_src_hi);
605
606         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
607     }
608
609     return s;
610 }
611
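/* The unified combiners below all follow the same pattern: a scalar
 * loop until the destination is 16-byte aligned, a 4-pixel SSE2 loop,
 * and a scalar tail.  OVER additionally short-circuits all-opaque
 * source blocks (plain store) and all-zero source blocks (destination
 * left untouched).
 */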
612 static force_inline void
613 core_combine_over_u_sse2 (uint32_t*       pd,
614                           const uint32_t* ps,
615                           const uint32_t* pm,
616                           int             w)
617 {
618     uint32_t s, d;
619
620     __m128i xmm_dst_lo, xmm_dst_hi;
621     __m128i xmm_src_lo, xmm_src_hi;
622     __m128i xmm_alpha_lo, xmm_alpha_hi;
623
624     /* Align dst on a 16-byte boundary */
625     while (w && ((unsigned long)pd & 15))
626     {
627         d = *pd;
628         s = combine1 (ps, pm);
629
630         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
631         ps++;
632         if (pm)
633             pm++;
634         w--;
635     }
636
637     while (w >= 4)
638     {
639         /* Load unaligned: the source and mask pointers are not
640          * guaranteed to be 16-byte aligned here.
641          */
642         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
643
644         if (is_opaque (xmm_src_hi))
645         {
646             save_128_aligned ((__m128i*)pd, xmm_src_hi);
647         }
648         else if (!is_zero (xmm_src_hi))
649         {
650             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
651
652             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
653             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
654
655             expand_alpha_2x128 (
656                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
657
658             over_2x128 (&xmm_src_lo, &xmm_src_hi,
659                         &xmm_alpha_lo, &xmm_alpha_hi,
660                         &xmm_dst_lo, &xmm_dst_hi);
661
662             /* rebuild the 4 pixel data and save */
663             save_128_aligned ((__m128i*)pd,
664                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
665         }
666
667         w -= 4;
668         ps += 4;
669         pd += 4;
670         if (pm)
671             pm += 4;
672     }
673
674     while (w)
675     {
676         d = *pd;
677         s = combine1 (ps, pm);
678
679         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
680         ps++;
681         if (pm)
682             pm++;
683
684         w--;
685     }
686 }
687
688 static force_inline void
689 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
690                                   const uint32_t* ps,
691                                   const uint32_t* pm,
692                                   int             w)
693 {
694     uint32_t s, d;
695
696     __m128i xmm_dst_lo, xmm_dst_hi;
697     __m128i xmm_src_lo, xmm_src_hi;
698     __m128i xmm_alpha_lo, xmm_alpha_hi;
699
700     /* Align dst on a 16-byte boundary */
701     while (w &&
702            ((unsigned long)pd & 15))
703     {
704         d = *pd;
705         s = combine1 (ps, pm);
706
707         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
708         w--;
709         ps++;
710         if (pm)
711             pm++;
712     }
713
714     while (w >= 4)
715     {
716         /* Load unaligned: the source and mask pointers are not
717          * guaranteed to be 16-byte aligned here.
718          */
719         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
720         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
721
722         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
723         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
724
725         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
726                             &xmm_alpha_lo, &xmm_alpha_hi);
727
728         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
729                     &xmm_alpha_lo, &xmm_alpha_hi,
730                     &xmm_src_lo, &xmm_src_hi);
731
732         /* rebuild the 4 pixel data and save */
733         save_128_aligned ((__m128i*)pd,
734                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
735
736         w -= 4;
737         ps += 4;
738         pd += 4;
739
740         if (pm)
741             pm += 4;
742     }
743
744     while (w)
745     {
746         d = *pd;
747         s = combine1 (ps, pm);
748
749         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
750         ps++;
751         w--;
752         if (pm)
753             pm++;
754     }
755 }
756
757 static force_inline uint32_t
758 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
759 {
760     uint32_t maska = src >> 24;
761
762     if (maska == 0)
763     {
764         return 0;
765     }
766     else if (maska != 0xff)
767     {
768         return pack_1x64_32 (
769             pix_multiply_1x64 (unpack_32_1x64 (dst),
770                                expand_alpha_1x64 (unpack_32_1x64 (src))));
771     }
772
773     return dst;
774 }
775
776 static force_inline void
777 core_combine_in_u_sse2 (uint32_t*       pd,
778                         const uint32_t* ps,
779                         const uint32_t* pm,
780                         int             w)
781 {
782     uint32_t s, d;
783
784     __m128i xmm_src_lo, xmm_src_hi;
785     __m128i xmm_dst_lo, xmm_dst_hi;
786
787     while (w && ((unsigned long) pd & 15))
788     {
789         s = combine1 (ps, pm);
790         d = *pd;
791
792         *pd++ = core_combine_in_u_pixelsse2 (d, s);
793         w--;
794         ps++;
795         if (pm)
796             pm++;
797     }
798
799     while (w >= 4)
800     {
801         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
802         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
803
804         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
805         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
806
807         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
808         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
809                             &xmm_dst_lo, &xmm_dst_hi,
810                             &xmm_dst_lo, &xmm_dst_hi);
811
812         save_128_aligned ((__m128i*)pd,
813                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
814
815         ps += 4;
816         pd += 4;
817         w -= 4;
818         if (pm)
819             pm += 4;
820     }
821
822     while (w)
823     {
824         s = combine1 (ps, pm);
825         d = *pd;
826
827         *pd++ = core_combine_in_u_pixelsse2 (d, s);
828         w--;
829         ps++;
830         if (pm)
831             pm++;
832     }
833 }
834
835 static force_inline void
836 core_combine_reverse_in_u_sse2 (uint32_t*       pd,
837                                 const uint32_t* ps,
838                                 const uint32_t *pm,
839                                 int             w)
840 {
841     uint32_t s, d;
842
843     __m128i xmm_src_lo, xmm_src_hi;
844     __m128i xmm_dst_lo, xmm_dst_hi;
845
846     while (w && ((unsigned long) pd & 15))
847     {
848         s = combine1 (ps, pm);
849         d = *pd;
850
851         *pd++ = core_combine_in_u_pixelsse2 (s, d);
852         ps++;
853         w--;
854         if (pm)
855             pm++;
856     }
857
858     while (w >= 4)
859     {
860         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
861         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
862
863         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
864         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
865
866         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
867         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
868                             &xmm_src_lo, &xmm_src_hi,
869                             &xmm_dst_lo, &xmm_dst_hi);
870
871         save_128_aligned (
872             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
873
874         ps += 4;
875         pd += 4;
876         w -= 4;
877         if (pm)
878             pm += 4;
879     }
880
881     while (w)
882     {
883         s = combine1 (ps, pm);
884         d = *pd;
885
886         *pd++ = core_combine_in_u_pixelsse2 (s, d);
887         w--;
888         ps++;
889         if (pm)
890             pm++;
891     }
892 }
893
894 static force_inline void
895 core_combine_reverse_out_u_sse2 (uint32_t*       pd,
896                                  const uint32_t* ps,
897                                  const uint32_t* pm,
898                                  int             w)
899 {
900     while (w && ((unsigned long) pd & 15))
901     {
902         uint32_t s = combine1 (ps, pm);
903         uint32_t d = *pd;
904
905         *pd++ = pack_1x64_32 (
906             pix_multiply_1x64 (
907                 unpack_32_1x64 (d), negate_1x64 (
908                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
909
910         if (pm)
911             pm++;
912         ps++;
913         w--;
914     }
915
916     while (w >= 4)
917     {
918         __m128i xmm_src_lo, xmm_src_hi;
919         __m128i xmm_dst_lo, xmm_dst_hi;
920
921         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
922         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
923
924         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
925         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
926
927         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
928         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
929
930         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
931                             &xmm_src_lo, &xmm_src_hi,
932                             &xmm_dst_lo, &xmm_dst_hi);
933
934         save_128_aligned (
935             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
936
937         ps += 4;
938         pd += 4;
939         if (pm)
940             pm += 4;
941
942         w -= 4;
943     }
944
945     while (w)
946     {
947         uint32_t s = combine1 (ps, pm);
948         uint32_t d = *pd;
949
950         *pd++ = pack_1x64_32 (
951             pix_multiply_1x64 (
952                 unpack_32_1x64 (d), negate_1x64 (
953                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
954         ps++;
955         if (pm)
956             pm++;
957         w--;
958     }
959 }
960
961 static force_inline void
962 core_combine_out_u_sse2 (uint32_t*       pd,
963                          const uint32_t* ps,
964                          const uint32_t* pm,
965                          int             w)
966 {
967     while (w && ((unsigned long) pd & 15))
968     {
969         uint32_t s = combine1 (ps, pm);
970         uint32_t d = *pd;
971
972         *pd++ = pack_1x64_32 (
973             pix_multiply_1x64 (
974                 unpack_32_1x64 (s), negate_1x64 (
975                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
976         w--;
977         ps++;
978         if (pm)
979             pm++;
980     }
981
982     while (w >= 4)
983     {
984         __m128i xmm_src_lo, xmm_src_hi;
985         __m128i xmm_dst_lo, xmm_dst_hi;
986
987         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
988         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
989
990         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
991         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
992
993         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
994         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
995
996         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
997                             &xmm_dst_lo, &xmm_dst_hi,
998                             &xmm_dst_lo, &xmm_dst_hi);
999
1000         save_128_aligned (
1001             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1002
1003         ps += 4;
1004         pd += 4;
1005         w -= 4;
1006         if (pm)
1007             pm += 4;
1008     }
1009
1010     while (w)
1011     {
1012         uint32_t s = combine1 (ps, pm);
1013         uint32_t d = *pd;
1014
1015         *pd++ = pack_1x64_32 (
1016             pix_multiply_1x64 (
1017                 unpack_32_1x64 (s), negate_1x64 (
1018                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1019         w--;
1020         ps++;
1021         if (pm)
1022             pm++;
1023     }
1024 }
1025
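/* ATOP: result = src * dst.alpha + dst * (1 - src.alpha). */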
1026 static force_inline uint32_t
1027 core_combine_atop_u_pixel_sse2 (uint32_t src,
1028                                 uint32_t dst)
1029 {
1030     __m64 s = unpack_32_1x64 (src);
1031     __m64 d = unpack_32_1x64 (dst);
1032
1033     __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1034     __m64 da = expand_alpha_1x64 (d);
1035
1036     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1037 }
1038
1039 static force_inline void
1040 core_combine_atop_u_sse2 (uint32_t*       pd,
1041                           const uint32_t* ps,
1042                           const uint32_t* pm,
1043                           int             w)
1044 {
1045     uint32_t s, d;
1046
1047     __m128i xmm_src_lo, xmm_src_hi;
1048     __m128i xmm_dst_lo, xmm_dst_hi;
1049     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1050     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1051
1052     while (w && ((unsigned long) pd & 15))
1053     {
1054         s = combine1 (ps, pm);
1055         d = *pd;
1056
1057         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1058         w--;
1059         ps++;
1060         if (pm)
1061             pm++;
1062     }
1063
1064     while (w >= 4)
1065     {
1066         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1067         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1068
1069         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1070         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1071
1072         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1073                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1074         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1075                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1076
1077         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1078                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1079
1080         pix_add_multiply_2x128 (
1081             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1082             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1083             &xmm_dst_lo, &xmm_dst_hi);
1084
1085         save_128_aligned (
1086             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1087
1088         ps += 4;
1089         pd += 4;
1090         w -= 4;
1091         if (pm)
1092             pm += 4;
1093     }
1094
1095     while (w)
1096     {
1097         s = combine1 (ps, pm);
1098         d = *pd;
1099
1100         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1101         w--;
1102         ps++;
1103         if (pm)
1104             pm++;
1105     }
1106 }
1107
1108 static force_inline uint32_t
1109 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1110                                         uint32_t dst)
1111 {
1112     __m64 s = unpack_32_1x64 (src);
1113     __m64 d = unpack_32_1x64 (dst);
1114
1115     __m64 sa = expand_alpha_1x64 (s);
1116     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1117
1118     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1119 }
1120
1121 static force_inline void
1122 core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
1123                                   const uint32_t* ps,
1124                                   const uint32_t* pm,
1125                                   int             w)
1126 {
1127     uint32_t s, d;
1128
1129     __m128i xmm_src_lo, xmm_src_hi;
1130     __m128i xmm_dst_lo, xmm_dst_hi;
1131     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1132     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1133
1134     while (w && ((unsigned long) pd & 15))
1135     {
1136         s = combine1 (ps, pm);
1137         d = *pd;
1138
1139         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1140         ps++;
1141         w--;
1142         if (pm)
1143             pm++;
1144     }
1145
1146     while (w >= 4)
1147     {
1148         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1149         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1150
1151         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1152         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1153
1154         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1155                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1156         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1157                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1158
1159         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1160                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1161
1162         pix_add_multiply_2x128 (
1163             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1164             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1165             &xmm_dst_lo, &xmm_dst_hi);
1166
1167         save_128_aligned (
1168             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1169
1170         ps += 4;
1171         pd += 4;
1172         w -= 4;
1173         if (pm)
1174             pm += 4;
1175     }
1176
1177     while (w)
1178     {
1179         s = combine1 (ps, pm);
1180         d = *pd;
1181
1182         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1183         ps++;
1184         w--;
1185         if (pm)
1186             pm++;
1187     }
1188 }
1189
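/* XOR: result = src * (1 - dst.alpha) + dst * (1 - src.alpha). */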
1190 static force_inline uint32_t
1191 core_combine_xor_u_pixel_sse2 (uint32_t src,
1192                                uint32_t dst)
1193 {
1194     __m64 s = unpack_32_1x64 (src);
1195     __m64 d = unpack_32_1x64 (dst);
1196
1197     __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1198     __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1199
1200     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1201 }
1202
1203 static force_inline void
1204 core_combine_xor_u_sse2 (uint32_t*       dst,
1205                          const uint32_t* src,
1206                          const uint32_t *mask,
1207                          int             width)
1208 {
1209     int w = width;
1210     uint32_t s, d;
1211     uint32_t* pd = dst;
1212     const uint32_t* ps = src;
1213     const uint32_t* pm = mask;
1214
1215     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1216     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1217     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1218     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1219
1220     while (w && ((unsigned long) pd & 15))
1221     {
1222         s = combine1 (ps, pm);
1223         d = *pd;
1224
1225         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1226         w--;
1227         ps++;
1228         if (pm)
1229             pm++;
1230     }
1231
1232     while (w >= 4)
1233     {
1234         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1235         xmm_dst = load_128_aligned ((__m128i*) pd);
1236
1237         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1238         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1239
1240         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1241                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1242         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1243                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1244
1245         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1246                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1247         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1248                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1249
1250         pix_add_multiply_2x128 (
1251             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1252             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1253             &xmm_dst_lo, &xmm_dst_hi);
1254
1255         save_128_aligned (
1256             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1257
1258         ps += 4;
1259         pd += 4;
1260         w -= 4;
1261         if (pm)
1262             pm += 4;
1263     }
1264
1265     while (w)
1266     {
1267         s = combine1 (ps, pm);
1268         d = *pd;
1269
1270         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1271         w--;
1272         ps++;
1273         if (pm)
1274             pm++;
1275     }
1276 }
1277
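/* ADD is a plain saturating per-channel addition, so the pixels never
 * need to be unpacked to 16 bits; _mm_adds_epu8 works directly on the
 * packed bytes.
 */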
1278 static force_inline void
1279 core_combine_add_u_sse2 (uint32_t*       dst,
1280                          const uint32_t* src,
1281                          const uint32_t* mask,
1282                          int             width)
1283 {
1284     int w = width;
1285     uint32_t s, d;
1286     uint32_t* pd = dst;
1287     const uint32_t* ps = src;
1288     const uint32_t* pm = mask;
1289
1290     while (w && (unsigned long)pd & 15)
1291     {
1292         s = combine1 (ps, pm);
1293         d = *pd;
1294
1295         ps++;
1296         if (pm)
1297             pm++;
1298         *pd++ = _mm_cvtsi64_si32 (
1299             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1300         w--;
1301     }
1302
1303     while (w >= 4)
1304     {
1305         __m128i s;
1306
1307         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1308
1309         save_128_aligned (
1310             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1311
1312         pd += 4;
1313         ps += 4;
1314         if (pm)
1315             pm += 4;
1316         w -= 4;
1317     }
1318
1319     while (w--)
1320     {
1321         s = combine1 (ps, pm);
1322         d = *pd;
1323
1324         ps++;
1325         *pd++ = _mm_cvtsi64_si32 (
1326             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1327         if (pm)
1328             pm++;
1329     }
1330 }
1331
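/* SATURATE: when the source alpha exceeds the destination's remaining
 * headroom (~dst.alpha), scale the source by the ratio da / sa before
 * adding; otherwise this degenerates to ADD.
 */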
1332 static force_inline uint32_t
1333 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1334                                     uint32_t dst)
1335 {
1336     __m64 ms = unpack_32_1x64 (src);
1337     __m64 md = unpack_32_1x64 (dst);
1338     uint32_t sa = src >> 24;
1339     uint32_t da = ~dst >> 24;
1340
1341     if (sa > da)
1342     {
1343         ms = pix_multiply_1x64 (
1344             ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1345     }
1346
1347     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1348 }
1349
1350 static force_inline void
1351 core_combine_saturate_u_sse2 (uint32_t *      pd,
1352                               const uint32_t *ps,
1353                               const uint32_t *pm,
1354                               int             w)
1355 {
1356     uint32_t s, d;
1357
1358     uint32_t pack_cmp;
1359     __m128i xmm_src, xmm_dst;
1360
1361     while (w && (unsigned long)pd & 15)
1362     {
1363         s = combine1 (ps, pm);
1364         d = *pd;
1365
1366         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1367         w--;
1368         ps++;
1369         if (pm)
1370             pm++;
1371     }
1372
1373     while (w >= 4)
1374     {
1375         xmm_dst = load_128_aligned  ((__m128i*)pd);
1376         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1377
1378         pack_cmp = _mm_movemask_epi8 (
1379             _mm_cmpgt_epi32 (
1380                 _mm_srli_epi32 (xmm_src, 24),
1381                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1382
1383         /* if any source alpha is greater than the corresponding ~dst alpha */
1384         if (pack_cmp)
1385         {
1386             s = combine1 (ps++, pm);
1387             d = *pd;
1388             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1389             if (pm)
1390                 pm++;
1391
1392             s = combine1 (ps++, pm);
1393             d = *pd;
1394             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1395             if (pm)
1396                 pm++;
1397
1398             s = combine1 (ps++, pm);
1399             d = *pd;
1400             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1401             if (pm)
1402                 pm++;
1403
1404             s = combine1 (ps++, pm);
1405             d = *pd;
1406             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1407             if (pm)
1408                 pm++;
1409         }
1410         else
1411         {
1412             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1413
1414             pd += 4;
1415             ps += 4;
1416             if (pm)
1417                 pm += 4;
1418         }
1419
1420         w -= 4;
1421     }
1422
1423     while (w--)
1424     {
1425         s = combine1 (ps, pm);
1426         d = *pd;
1427
1428         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1429         ps++;
1430         if (pm)
1431             pm++;
1432     }
1433 }
1434
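/* The *_ca combiners below are the component-alpha variants: the mask
 * supplies a separate 8-bit factor for each color channel instead of a
 * single alpha value, so source and mask are always multiplied per
 * channel.
 */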
1435 static force_inline void
1436 core_combine_src_ca_sse2 (uint32_t*       pd,
1437                           const uint32_t* ps,
1438                           const uint32_t *pm,
1439                           int             w)
1440 {
1441     uint32_t s, m;
1442
1443     __m128i xmm_src_lo, xmm_src_hi;
1444     __m128i xmm_mask_lo, xmm_mask_hi;
1445     __m128i xmm_dst_lo, xmm_dst_hi;
1446
1447     while (w && (unsigned long)pd & 15)
1448     {
1449         s = *ps++;
1450         m = *pm++;
1451         *pd++ = pack_1x64_32 (
1452             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1453         w--;
1454     }
1455
1456     while (w >= 4)
1457     {
1458         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1459         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1460
1461         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1462         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1463
1464         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1465                             &xmm_mask_lo, &xmm_mask_hi,
1466                             &xmm_dst_lo, &xmm_dst_hi);
1467
1468         save_128_aligned (
1469             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1470
1471         ps += 4;
1472         pd += 4;
1473         pm += 4;
1474         w -= 4;
1475     }
1476
1477     while (w)
1478     {
1479         s = *ps++;
1480         m = *pm++;
1481         *pd++ = pack_1x64_32 (
1482             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1483         w--;
1484     }
1485 }
1486
1487 static force_inline uint32_t
1488 core_combine_over_ca_pixel_sse2 (uint32_t src,
1489                                  uint32_t mask,
1490                                  uint32_t dst)
1491 {
1492     __m64 s = unpack_32_1x64 (src);
1493     __m64 expAlpha = expand_alpha_1x64 (s);
1494     __m64 unpk_mask = unpack_32_1x64 (mask);
1495     __m64 unpk_dst  = unpack_32_1x64 (dst);
1496
1497     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1498 }
1499
1500 static force_inline void
1501 core_combine_over_ca_sse2 (uint32_t*       pd,
1502                            const uint32_t* ps,
1503                            const uint32_t *pm,
1504                            int             w)
1505 {
1506     uint32_t s, m, d;
1507
1508     __m128i xmm_alpha_lo, xmm_alpha_hi;
1509     __m128i xmm_src_lo, xmm_src_hi;
1510     __m128i xmm_dst_lo, xmm_dst_hi;
1511     __m128i xmm_mask_lo, xmm_mask_hi;
1512
1513     while (w && (unsigned long)pd & 15)
1514     {
1515         s = *ps++;
1516         m = *pm++;
1517         d = *pd;
1518
1519         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1520         w--;
1521     }
1522
1523     while (w >= 4)
1524     {
1525         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1526         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1527         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1528
1529         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1530         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1531         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1532
1533         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1534                             &xmm_alpha_lo, &xmm_alpha_hi);
1535
1536         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1537                        &xmm_alpha_lo, &xmm_alpha_hi,
1538                        &xmm_mask_lo, &xmm_mask_hi,
1539                        &xmm_dst_lo, &xmm_dst_hi);
1540
1541         save_128_aligned (
1542             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1543
1544         ps += 4;
1545         pd += 4;
1546         pm += 4;
1547         w -= 4;
1548     }
1549
1550     while (w)
1551     {
1552         s = *ps++;
1553         m = *pm++;
1554         d = *pd;
1555
1556         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1557         w--;
1558     }
1559 }
1560
1561 static force_inline uint32_t
1562 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1563                                          uint32_t mask,
1564                                          uint32_t dst)
1565 {
1566     __m64 d = unpack_32_1x64 (dst);
1567
1568     return pack_1x64_32 (
1569         over_1x64 (d, expand_alpha_1x64 (d),
1570                    pix_multiply_1x64 (unpack_32_1x64 (src),
1571                                       unpack_32_1x64 (mask))));
1572 }
1573
1574 static force_inline void
1575 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1576                                    const uint32_t* ps,
1577                                    const uint32_t *pm,
1578                                    int             w)
1579 {
1580     uint32_t s, m, d;
1581
1582     __m128i xmm_alpha_lo, xmm_alpha_hi;
1583     __m128i xmm_src_lo, xmm_src_hi;
1584     __m128i xmm_dst_lo, xmm_dst_hi;
1585     __m128i xmm_mask_lo, xmm_mask_hi;
1586
1587     while (w && (unsigned long)pd & 15)
1588     {
1589         s = *ps++;
1590         m = *pm++;
1591         d = *pd;
1592
1593         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1594         w--;
1595     }
1596
1597     while (w >= 4)
1598     {
1599         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1600         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1601         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1602
1603         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1604         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1605         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1606
1607         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1608                             &xmm_alpha_lo, &xmm_alpha_hi);
1609         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1610                             &xmm_mask_lo, &xmm_mask_hi,
1611                             &xmm_mask_lo, &xmm_mask_hi);
1612
1613         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1614                     &xmm_alpha_lo, &xmm_alpha_hi,
1615                     &xmm_mask_lo, &xmm_mask_hi);
1616
1617         save_128_aligned (
1618             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1619
1620         ps += 4;
1621         pd += 4;
1622         pm += 4;
1623         w -= 4;
1624     }
1625
1626     while (w)
1627     {
1628         s = *ps++;
1629         m = *pm++;
1630         d = *pd;
1631
1632         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1633         w--;
1634     }
1635 }
1636
1637 static force_inline void
1638 core_combine_in_ca_sse2 (uint32_t *      pd,
1639                          const uint32_t *ps,
1640                          const uint32_t *pm,
1641                          int             w)
1642 {
1643     uint32_t s, m, d;
1644
1645     __m128i xmm_alpha_lo, xmm_alpha_hi;
1646     __m128i xmm_src_lo, xmm_src_hi;
1647     __m128i xmm_dst_lo, xmm_dst_hi;
1648     __m128i xmm_mask_lo, xmm_mask_hi;
1649
1650     while (w && (unsigned long)pd & 15)
1651     {
1652         s = *ps++;
1653         m = *pm++;
1654         d = *pd;
1655
1656         *pd++ = pack_1x64_32 (
1657             pix_multiply_1x64 (
1658                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1659                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1660
1661         w--;
1662     }
1663
1664     while (w >= 4)
1665     {
1666         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1667         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1668         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1669
1670         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1671         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1672         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1673
1674         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1675                             &xmm_alpha_lo, &xmm_alpha_hi);
1676
1677         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1678                             &xmm_mask_lo, &xmm_mask_hi,
1679                             &xmm_dst_lo, &xmm_dst_hi);
1680
1681         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1682                             &xmm_alpha_lo, &xmm_alpha_hi,
1683                             &xmm_dst_lo, &xmm_dst_hi);
1684
1685         save_128_aligned (
1686             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1687
1688         ps += 4;
1689         pd += 4;
1690         pm += 4;
1691         w -= 4;
1692     }
1693
1694     while (w)
1695     {
1696         s = *ps++;
1697         m = *pm++;
1698         d = *pd;
1699
1700         *pd++ = pack_1x64_32 (
1701             pix_multiply_1x64 (
1702                 pix_multiply_1x64 (
1703                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1704                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1705
1706         w--;
1707     }
1708 }
1709
1710 static force_inline void
1711 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1712                                  const uint32_t *ps,
1713                                  const uint32_t *pm,
1714                                  int             w)
1715 {
1716     uint32_t s, m, d;
1717
1718     __m128i xmm_alpha_lo, xmm_alpha_hi;
1719     __m128i xmm_src_lo, xmm_src_hi;
1720     __m128i xmm_dst_lo, xmm_dst_hi;
1721     __m128i xmm_mask_lo, xmm_mask_hi;
1722
1723     while (w && (unsigned long)pd & 15)
1724     {
1725         s = *ps++;
1726         m = *pm++;
1727         d = *pd;
1728
1729         *pd++ = pack_1x64_32 (
1730             pix_multiply_1x64 (
1731                 unpack_32_1x64 (d),
1732                 pix_multiply_1x64 (unpack_32_1x64 (m),
1733                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1734         w--;
1735     }
1736
1737     while (w >= 4)
1738     {
1739         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1740         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1741         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1742
1743         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1744         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1745         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1746
1747         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1748                             &xmm_alpha_lo, &xmm_alpha_hi);
1749         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1750                             &xmm_alpha_lo, &xmm_alpha_hi,
1751                             &xmm_alpha_lo, &xmm_alpha_hi);
1752
1753         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1754                             &xmm_alpha_lo, &xmm_alpha_hi,
1755                             &xmm_dst_lo, &xmm_dst_hi);
1756
1757         save_128_aligned (
1758             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1759
1760         ps += 4;
1761         pd += 4;
1762         pm += 4;
1763         w -= 4;
1764     }
1765
1766     while (w)
1767     {
1768         s = *ps++;
1769         m = *pm++;
1770         d = *pd;
1771
1772         *pd++ = pack_1x64_32 (
1773             pix_multiply_1x64 (
1774                 unpack_32_1x64 (d),
1775                 pix_multiply_1x64 (unpack_32_1x64 (m),
1776                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1777         w--;
1778     }
1779 }
1780
1781 static force_inline void
1782 core_combine_out_ca_sse2 (uint32_t *      pd,
1783                           const uint32_t *ps,
1784                           const uint32_t *pm,
1785                           int             w)
1786 {
1787     uint32_t s, m, d;
1788
1789     __m128i xmm_alpha_lo, xmm_alpha_hi;
1790     __m128i xmm_src_lo, xmm_src_hi;
1791     __m128i xmm_dst_lo, xmm_dst_hi;
1792     __m128i xmm_mask_lo, xmm_mask_hi;
1793
1794     while (w && (unsigned long)pd & 15)
1795     {
1796         s = *ps++;
1797         m = *pm++;
1798         d = *pd;
1799
1800         *pd++ = pack_1x64_32 (
1801             pix_multiply_1x64 (
1802                 pix_multiply_1x64 (
1803                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1804                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
1805         w--;
1806     }
1807
1808     while (w >= 4)
1809     {
1810         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1811         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1812         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1813
1814         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1815         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1816         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1817
1818         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1819                             &xmm_alpha_lo, &xmm_alpha_hi);
1820         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1821                       &xmm_alpha_lo, &xmm_alpha_hi);
1822
1823         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1824                             &xmm_mask_lo, &xmm_mask_hi,
1825                             &xmm_dst_lo, &xmm_dst_hi);
1826         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1827                             &xmm_alpha_lo, &xmm_alpha_hi,
1828                             &xmm_dst_lo, &xmm_dst_hi);
1829
1830         save_128_aligned (
1831             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1832
1833         ps += 4;
1834         pd += 4;
1835         pm += 4;
1836         w -= 4;
1837     }
1838
1839     while (w)
1840     {
1841         s = *ps++;
1842         m = *pm++;
1843         d = *pd;
1844
1845         *pd++ = pack_1x64_32 (
1846             pix_multiply_1x64 (
1847                 pix_multiply_1x64 (
1848                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1849                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
1850
1851         w--;
1852     }
1853 }
1854
1855 static force_inline void
1856 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
1857                                   const uint32_t *ps,
1858                                   const uint32_t *pm,
1859                                   int             w)
1860 {
1861     uint32_t s, m, d;
1862
1863     __m128i xmm_alpha_lo, xmm_alpha_hi;
1864     __m128i xmm_src_lo, xmm_src_hi;
1865     __m128i xmm_dst_lo, xmm_dst_hi;
1866     __m128i xmm_mask_lo, xmm_mask_hi;
1867
1868     while (w && (unsigned long)pd & 15)
1869     {
1870         s = *ps++;
1871         m = *pm++;
1872         d = *pd;
1873
1874         *pd++ = pack_1x64_32 (
1875             pix_multiply_1x64 (
1876                 unpack_32_1x64 (d),
1877                 negate_1x64 (pix_multiply_1x64 (
1878                                  unpack_32_1x64 (m),
1879                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
1880         w--;
1881     }
1882
1883     while (w >= 4)
1884     {
1885         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1886         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1887         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1888
1889         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1890         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1891         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1892
1893         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1894                             &xmm_alpha_lo, &xmm_alpha_hi);
1895
1896         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1897                             &xmm_alpha_lo, &xmm_alpha_hi,
1898                             &xmm_mask_lo, &xmm_mask_hi);
1899
1900         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1901                       &xmm_mask_lo, &xmm_mask_hi);
1902
1903         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1904                             &xmm_mask_lo, &xmm_mask_hi,
1905                             &xmm_dst_lo, &xmm_dst_hi);
1906
1907         save_128_aligned (
1908             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1909
1910         ps += 4;
1911         pd += 4;
1912         pm += 4;
1913         w -= 4;
1914     }
1915
1916     while (w)
1917     {
1918         s = *ps++;
1919         m = *pm++;
1920         d = *pd;
1921
1922         *pd++ = pack_1x64_32 (
1923             pix_multiply_1x64 (
1924                 unpack_32_1x64 (d),
1925                 negate_1x64 (pix_multiply_1x64 (
1926                                  unpack_32_1x64 (m),
1927                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
1928         w--;
1929     }
1930 }
1931
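/* Component-alpha ATOP, per channel (roughly):
 *
 *     dest = (src * mask) * dest.a + dest * (1 - mask * src.a)
 *
 * where every product is a rounded 8-bit multiply (x * y / 255).
 */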
1932 static force_inline uint32_t
1933 core_combine_atop_ca_pixel_sse2 (uint32_t src,
1934                                  uint32_t mask,
1935                                  uint32_t dst)
1936 {
1937     __m64 m = unpack_32_1x64 (mask);
1938     __m64 s = unpack_32_1x64 (src);
1939     __m64 d = unpack_32_1x64 (dst);
1940     __m64 sa = expand_alpha_1x64 (s);
1941     __m64 da = expand_alpha_1x64 (d);
1942
1943     s = pix_multiply_1x64 (s, m);
1944     m = negate_1x64 (pix_multiply_1x64 (m, sa));
1945
1946     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
1947 }
1948
1949 static force_inline void
1950 core_combine_atop_ca_sse2 (uint32_t *      pd,
1951                            const uint32_t *ps,
1952                            const uint32_t *pm,
1953                            int             w)
1954 {
1955     uint32_t s, m, d;
1956
1957     __m128i xmm_src_lo, xmm_src_hi;
1958     __m128i xmm_dst_lo, xmm_dst_hi;
1959     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1960     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1961     __m128i xmm_mask_lo, xmm_mask_hi;
1962
1963     while (w && (unsigned long)pd & 15)
1964     {
1965         s = *ps++;
1966         m = *pm++;
1967         d = *pd;
1968
1969         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
1970         w--;
1971     }
1972
1973     while (w >= 4)
1974     {
1975         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1976         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1977         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1978
1979         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1980         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1981         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1982
1983         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1984                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1985         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1986                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1987
1988         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1989                             &xmm_mask_lo, &xmm_mask_hi,
1990                             &xmm_src_lo, &xmm_src_hi);
1991         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1992                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1993                             &xmm_mask_lo, &xmm_mask_hi);
1994
1995         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1996
1997         pix_add_multiply_2x128 (
1998             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
1999             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2000             &xmm_dst_lo, &xmm_dst_hi);
2001
2002         save_128_aligned (
2003             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2004
2005         ps += 4;
2006         pd += 4;
2007         pm += 4;
2008         w -= 4;
2009     }
2010
2011     while (w)
2012     {
2013         s = *ps++;
2014         m = *pm++;
2015         d = *pd;
2016
2017         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2018         w--;
2019     }
2020 }
2021
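/* Reverse ATOP swaps the two alpha terms, per channel (roughly):
 *
 *     dest = (src * mask) * (1 - dest.a) + dest * (mask * src.a)
 */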
2022 static force_inline uint32_t
2023 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2024                                          uint32_t mask,
2025                                          uint32_t dst)
2026 {
2027     __m64 m = unpack_32_1x64 (mask);
2028     __m64 s = unpack_32_1x64 (src);
2029     __m64 d = unpack_32_1x64 (dst);
2030
2031     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2032     __m64 sa = expand_alpha_1x64 (s);
2033
2034     s = pix_multiply_1x64 (s, m);
2035     m = pix_multiply_1x64 (m, sa);
2036
2037     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2038 }
2039
2040 static force_inline void
2041 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2042                                    const uint32_t *ps,
2043                                    const uint32_t *pm,
2044                                    int             w)
2045 {
2046     uint32_t s, m, d;
2047
2048     __m128i xmm_src_lo, xmm_src_hi;
2049     __m128i xmm_dst_lo, xmm_dst_hi;
2050     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2051     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2052     __m128i xmm_mask_lo, xmm_mask_hi;
2053
2054     while (w && (unsigned long)pd & 15)
2055     {
2056         s = *ps++;
2057         m = *pm++;
2058         d = *pd;
2059
2060         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2061         w--;
2062     }
2063
2064     while (w >= 4)
2065     {
2066         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2067         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2068         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2069
2070         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2071         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2072         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2073
2074         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2075                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2076         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2077                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2078
2079         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2080                             &xmm_mask_lo, &xmm_mask_hi,
2081                             &xmm_src_lo, &xmm_src_hi);
2082         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2083                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2084                             &xmm_mask_lo, &xmm_mask_hi);
2085
2086         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2087                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2088
2089         pix_add_multiply_2x128 (
2090             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092             &xmm_dst_lo, &xmm_dst_hi);
2093
2094         save_128_aligned (
2095             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096
2097         ps += 4;
2098         pd += 4;
2099         pm += 4;
2100         w -= 4;
2101     }
2102
2103     while (w)
2104     {
2105         s = *ps++;
2106         m = *pm++;
2107         d = *pd;
2108
2109         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2110         w--;
2111     }
2112 }
2113
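/* Component-alpha XOR keeps only the non-overlapping parts, per channel
 * (roughly):
 *
 *     dest = (src * mask) * (1 - dest.a) + dest * (1 - mask * src.a)
 */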
2114 static force_inline uint32_t
2115 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2116                                 uint32_t mask,
2117                                 uint32_t dst)
2118 {
2119     __m64 a = unpack_32_1x64 (mask);
2120     __m64 s = unpack_32_1x64 (src);
2121     __m64 d = unpack_32_1x64 (dst);
2122
2123     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2124                                        a, expand_alpha_1x64 (s)));
2125     __m64 dest      = pix_multiply_1x64 (s, a);
2126     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2127
2128     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2129                                                 &alpha_dst,
2130                                                 &dest,
2131                                                 &alpha_src));
2132 }
2133
2134 static force_inline void
2135 core_combine_xor_ca_sse2 (uint32_t *      pd,
2136                           const uint32_t *ps,
2137                           const uint32_t *pm,
2138                           int             w)
2139 {
2140     uint32_t s, m, d;
2141
2142     __m128i xmm_src_lo, xmm_src_hi;
2143     __m128i xmm_dst_lo, xmm_dst_hi;
2144     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2145     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2146     __m128i xmm_mask_lo, xmm_mask_hi;
2147
2148     while (w && (unsigned long)pd & 15)
2149     {
2150         s = *ps++;
2151         m = *pm++;
2152         d = *pd;
2153
2154         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2155         w--;
2156     }
2157
2158     while (w >= 4)
2159     {
2160         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163
2164         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167
2168         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2170         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2171                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172
2173         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2174                             &xmm_mask_lo, &xmm_mask_hi,
2175                             &xmm_src_lo, &xmm_src_hi);
2176         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2177                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2178                             &xmm_mask_lo, &xmm_mask_hi);
2179
2180         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2181                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2183                       &xmm_mask_lo, &xmm_mask_hi);
2184
2185         pix_add_multiply_2x128 (
2186             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2187             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2188             &xmm_dst_lo, &xmm_dst_hi);
2189
2190         save_128_aligned (
2191             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2192
2193         ps += 4;
2194         pd += 4;
2195         pm += 4;
2196         w -= 4;
2197     }
2198
2199     while (w)
2200     {
2201         s = *ps++;
2202         m = *pm++;
2203         d = *pd;
2204
2205         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2206         w--;
2207     }
2208 }
2209
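/* Component-alpha ADD is a saturating per-channel sum, roughly
 * dest = clamp (src * mask + dest), done with _mm_adds_epu8 in the
 * vector loop and _mm_adds_pu8 for the leading and trailing pixels.
 */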
2210 static force_inline void
2211 core_combine_add_ca_sse2 (uint32_t *      pd,
2212                           const uint32_t *ps,
2213                           const uint32_t *pm,
2214                           int             w)
2215 {
2216     uint32_t s, m, d;
2217
2218     __m128i xmm_src_lo, xmm_src_hi;
2219     __m128i xmm_dst_lo, xmm_dst_hi;
2220     __m128i xmm_mask_lo, xmm_mask_hi;
2221
2222     while (w && (unsigned long)pd & 15)
2223     {
2224         s = *ps++;
2225         m = *pm++;
2226         d = *pd;
2227
2228         *pd++ = pack_1x64_32 (
2229             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2230                                              unpack_32_1x64 (m)),
2231                           unpack_32_1x64 (d)));
2232         w--;
2233     }
2234
2235     while (w >= 4)
2236     {
2237         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2238         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2239         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2240
2241         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2242         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2243         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2244
2245         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2246                             &xmm_mask_lo, &xmm_mask_hi,
2247                             &xmm_src_lo, &xmm_src_hi);
2248
2249         save_128_aligned (
2250             (__m128i*)pd, pack_2x128_128 (
2251                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2252                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2253
2254         ps += 4;
2255         pd += 4;
2256         pm += 4;
2257         w -= 4;
2258     }
2259
2260     while (w)
2261     {
2262         s = *ps++;
2263         m = *pm++;
2264         d = *pd;
2265
2266         *pd++ = pack_1x64_32 (
2267             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2268                                              unpack_32_1x64 (m)),
2269                           unpack_32_1x64 (d)));
2270         w--;
2271     }
2272 }
2273
2274 /* ---------------------------------------------------
2275  * fb_compose_setup_SSE2

2276  */
2277 static force_inline __m64
2278 create_mask_16_64 (uint16_t mask)
2279 {
2280     return _mm_set1_pi16 (mask);
2281 }
2282
2283 static force_inline __m128i
2284 create_mask_16_128 (uint16_t mask)
2285 {
2286     return _mm_set1_epi16 (mask);
2287 }
2288
2289 static force_inline __m64
2290 create_mask_2x32_64 (uint32_t mask0,
2291                      uint32_t mask1)
2292 {
2293     return _mm_set_pi32 (mask0, mask1);
2294 }
2295
2296 /* Work around a code generation bug in Sun Studio 12. */
2297 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2298 # define create_mask_2x32_128(mask0, mask1)                             \
2299     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2300 #else
2301 static force_inline __m128i
2302 create_mask_2x32_128 (uint32_t mask0,
2303                       uint32_t mask1)
2304 {
2305     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2306 }
2307 #endif
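/* For illustration only (the real constant setup presumably lives in the
 * implementation's init code, not in this excerpt): these helpers broadcast
 * constants such as
 *
 *     create_mask_2x32_128 (0xff000000, 0xff000000)   - opaque alpha bytes
 *     create_mask_16_128 (mask >> 24)                 - one alpha in all lanes
 */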
2308
2309 /* SSE2 code patch for fbcompose.c */
2310
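/* Each wrapper below forwards to the matching core_combine_*_sse2 worker
 * and then calls _mm_empty () so the MMX state used by the 1x64 helpers
 * is cleared before control returns to code that may use x87 floating
 * point.
 */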
2311 static void
2312 sse2_combine_over_u (pixman_implementation_t *imp,
2313                      pixman_op_t              op,
2314                      uint32_t *               dst,
2315                      const uint32_t *         src,
2316                      const uint32_t *         mask,
2317                      int                      width)
2318 {
2319     core_combine_over_u_sse2 (dst, src, mask, width);
2320     _mm_empty ();
2321 }
2322
2323 static void
2324 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2325                              pixman_op_t              op,
2326                              uint32_t *               dst,
2327                              const uint32_t *         src,
2328                              const uint32_t *         mask,
2329                              int                      width)
2330 {
2331     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2332     _mm_empty ();
2333 }
2334
2335 static void
2336 sse2_combine_in_u (pixman_implementation_t *imp,
2337                    pixman_op_t              op,
2338                    uint32_t *               dst,
2339                    const uint32_t *         src,
2340                    const uint32_t *         mask,
2341                    int                      width)
2342 {
2343     core_combine_in_u_sse2 (dst, src, mask, width);
2344     _mm_empty ();
2345 }
2346
2347 static void
2348 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2349                            pixman_op_t              op,
2350                            uint32_t *               dst,
2351                            const uint32_t *         src,
2352                            const uint32_t *         mask,
2353                            int                      width)
2354 {
2355     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2356     _mm_empty ();
2357 }
2358
2359 static void
2360 sse2_combine_out_u (pixman_implementation_t *imp,
2361                     pixman_op_t              op,
2362                     uint32_t *               dst,
2363                     const uint32_t *         src,
2364                     const uint32_t *         mask,
2365                     int                      width)
2366 {
2367     core_combine_out_u_sse2 (dst, src, mask, width);
2368     _mm_empty ();
2369 }
2370
2371 static void
2372 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2373                             pixman_op_t              op,
2374                             uint32_t *               dst,
2375                             const uint32_t *         src,
2376                             const uint32_t *         mask,
2377                             int                      width)
2378 {
2379     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2380     _mm_empty ();
2381 }
2382
2383 static void
2384 sse2_combine_atop_u (pixman_implementation_t *imp,
2385                      pixman_op_t              op,
2386                      uint32_t *               dst,
2387                      const uint32_t *         src,
2388                      const uint32_t *         mask,
2389                      int                      width)
2390 {
2391     core_combine_atop_u_sse2 (dst, src, mask, width);
2392     _mm_empty ();
2393 }
2394
2395 static void
2396 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2397                              pixman_op_t              op,
2398                              uint32_t *               dst,
2399                              const uint32_t *         src,
2400                              const uint32_t *         mask,
2401                              int                      width)
2402 {
2403     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2404     _mm_empty ();
2405 }
2406
2407 static void
2408 sse2_combine_xor_u (pixman_implementation_t *imp,
2409                     pixman_op_t              op,
2410                     uint32_t *               dst,
2411                     const uint32_t *         src,
2412                     const uint32_t *         mask,
2413                     int                      width)
2414 {
2415     core_combine_xor_u_sse2 (dst, src, mask, width);
2416     _mm_empty ();
2417 }
2418
2419 static void
2420 sse2_combine_add_u (pixman_implementation_t *imp,
2421                     pixman_op_t              op,
2422                     uint32_t *               dst,
2423                     const uint32_t *         src,
2424                     const uint32_t *         mask,
2425                     int                      width)
2426 {
2427     core_combine_add_u_sse2 (dst, src, mask, width);
2428     _mm_empty ();
2429 }
2430
2431 static void
2432 sse2_combine_saturate_u (pixman_implementation_t *imp,
2433                          pixman_op_t              op,
2434                          uint32_t *               dst,
2435                          const uint32_t *         src,
2436                          const uint32_t *         mask,
2437                          int                      width)
2438 {
2439     core_combine_saturate_u_sse2 (dst, src, mask, width);
2440     _mm_empty ();
2441 }
2442
2443 static void
2444 sse2_combine_src_ca (pixman_implementation_t *imp,
2445                      pixman_op_t              op,
2446                      uint32_t *               dst,
2447                      const uint32_t *         src,
2448                      const uint32_t *         mask,
2449                      int                      width)
2450 {
2451     core_combine_src_ca_sse2 (dst, src, mask, width);
2452     _mm_empty ();
2453 }
2454
2455 static void
2456 sse2_combine_over_ca (pixman_implementation_t *imp,
2457                       pixman_op_t              op,
2458                       uint32_t *               dst,
2459                       const uint32_t *         src,
2460                       const uint32_t *         mask,
2461                       int                      width)
2462 {
2463     core_combine_over_ca_sse2 (dst, src, mask, width);
2464     _mm_empty ();
2465 }
2466
2467 static void
2468 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2469                               pixman_op_t              op,
2470                               uint32_t *               dst,
2471                               const uint32_t *         src,
2472                               const uint32_t *         mask,
2473                               int                      width)
2474 {
2475     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2476     _mm_empty ();
2477 }
2478
2479 static void
2480 sse2_combine_in_ca (pixman_implementation_t *imp,
2481                     pixman_op_t              op,
2482                     uint32_t *               dst,
2483                     const uint32_t *         src,
2484                     const uint32_t *         mask,
2485                     int                      width)
2486 {
2487     core_combine_in_ca_sse2 (dst, src, mask, width);
2488     _mm_empty ();
2489 }
2490
2491 static void
2492 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2493                             pixman_op_t              op,
2494                             uint32_t *               dst,
2495                             const uint32_t *         src,
2496                             const uint32_t *         mask,
2497                             int                      width)
2498 {
2499     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2500     _mm_empty ();
2501 }
2502
2503 static void
2504 sse2_combine_out_ca (pixman_implementation_t *imp,
2505                      pixman_op_t              op,
2506                      uint32_t *               dst,
2507                      const uint32_t *         src,
2508                      const uint32_t *         mask,
2509                      int                      width)
2510 {
2511     core_combine_out_ca_sse2 (dst, src, mask, width);
2512     _mm_empty ();
2513 }
2514
2515 static void
2516 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2517                              pixman_op_t              op,
2518                              uint32_t *               dst,
2519                              const uint32_t *         src,
2520                              const uint32_t *         mask,
2521                              int                      width)
2522 {
2523     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2524     _mm_empty ();
2525 }
2526
2527 static void
2528 sse2_combine_atop_ca (pixman_implementation_t *imp,
2529                       pixman_op_t              op,
2530                       uint32_t *               dst,
2531                       const uint32_t *         src,
2532                       const uint32_t *         mask,
2533                       int                      width)
2534 {
2535     core_combine_atop_ca_sse2 (dst, src, mask, width);
2536     _mm_empty ();
2537 }
2538
2539 static void
2540 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2541                               pixman_op_t              op,
2542                               uint32_t *               dst,
2543                               const uint32_t *         src,
2544                               const uint32_t *         mask,
2545                               int                      width)
2546 {
2547     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2548     _mm_empty ();
2549 }
2550
2551 static void
2552 sse2_combine_xor_ca (pixman_implementation_t *imp,
2553                      pixman_op_t              op,
2554                      uint32_t *               dst,
2555                      const uint32_t *         src,
2556                      const uint32_t *         mask,
2557                      int                      width)
2558 {
2559     core_combine_xor_ca_sse2 (dst, src, mask, width);
2560     _mm_empty ();
2561 }
2562
2563 static void
2564 sse2_combine_add_ca (pixman_implementation_t *imp,
2565                      pixman_op_t              op,
2566                      uint32_t *               dst,
2567                      const uint32_t *         src,
2568                      const uint32_t *         mask,
2569                      int                      width)
2570 {
2571     core_combine_add_ca_sse2 (dst, src, mask, width);
2572     _mm_empty ();
2573 }
2574
2575 /* -------------------------------------------------------------------
2576  * composite_over_n_8888
2577  */
2578
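/* With a solid source, OVER reduces per pixel to
 *
 *     dest = src + dest * (1 - src.a)
 *
 * so the source and its expanded alpha are computed once, before the
 * scanline loops, and reused for every destination pixel.
 */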
2579 static void
2580 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2581                             pixman_op_t              op,
2582                             pixman_image_t *         src_image,
2583                             pixman_image_t *         mask_image,
2584                             pixman_image_t *         dst_image,
2585                             int32_t                  src_x,
2586                             int32_t                  src_y,
2587                             int32_t                  mask_x,
2588                             int32_t                  mask_y,
2589                             int32_t                  dest_x,
2590                             int32_t                  dest_y,
2591                             int32_t                  width,
2592                             int32_t                  height)
2593 {
2594     uint32_t src;
2595     uint32_t    *dst_line, *dst, d;
2596     int32_t w;
2597     int dst_stride;
2598     __m128i xmm_src, xmm_alpha;
2599     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2600
2601     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2602
2603     if (src == 0)
2604         return;
2605
2606     PIXMAN_IMAGE_GET_LINE (
2607         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2608
2609     xmm_src = expand_pixel_32_1x128 (src);
2610     xmm_alpha = expand_alpha_1x128 (xmm_src);
2611
2612     while (height--)
2613     {
2614         dst = dst_line;
2615
2616         dst_line += dst_stride;
2617         w = width;
2618
2619         while (w && (unsigned long)dst & 15)
2620         {
2621             d = *dst;
2622             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2623                                               _mm_movepi64_pi64 (xmm_alpha),
2624                                               unpack_32_1x64 (d)));
2625             w--;
2626         }
2627
2628         while (w >= 4)
2629         {
2630             xmm_dst = load_128_aligned ((__m128i*)dst);
2631
2632             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2633
2634             over_2x128 (&xmm_src, &xmm_src,
2635                         &xmm_alpha, &xmm_alpha,
2636                         &xmm_dst_lo, &xmm_dst_hi);
2637
2638             /* rebuild the 4 pixel data and save */
2639             save_128_aligned (
2640                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2641
2642             w -= 4;
2643             dst += 4;
2644         }
2645
2646         while (w)
2647         {
2648             d = *dst;
2649             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2650                                               _mm_movepi64_pi64 (xmm_alpha),
2651                                               unpack_32_1x64 (d)));
2652             w--;
2653         }
2654
2655     }
2656     _mm_empty ();
2657 }
2658
2659 /* ---------------------------------------------------------------------
2660  * composite_over_n_0565
2661  */
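/* Same solid-source OVER, but the destination is r5g6b5: the vector loop
 * handles 8 pixels per 128-bit load, expanding them to four 8888 halves
 * with unpack_565_128_4x128, blending, and repacking with
 * pack_565_4x128_128 before the aligned store.
 */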
2662 static void
2663 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2664                             pixman_op_t              op,
2665                             pixman_image_t *         src_image,
2666                             pixman_image_t *         mask_image,
2667                             pixman_image_t *         dst_image,
2668                             int32_t                  src_x,
2669                             int32_t                  src_y,
2670                             int32_t                  mask_x,
2671                             int32_t                  mask_y,
2672                             int32_t                  dest_x,
2673                             int32_t                  dest_y,
2674                             int32_t                  width,
2675                             int32_t                  height)
2676 {
2677     uint32_t src;
2678     uint16_t    *dst_line, *dst, d;
2679     int32_t w;
2680     int dst_stride;
2681     __m128i xmm_src, xmm_alpha;
2682     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2683
2684     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2685
2686     if (src == 0)
2687         return;
2688
2689     PIXMAN_IMAGE_GET_LINE (
2690         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2691
2692     xmm_src = expand_pixel_32_1x128 (src);
2693     xmm_alpha = expand_alpha_1x128 (xmm_src);
2694
2695     while (height--)
2696     {
2697         dst = dst_line;
2698
2699         dst_line += dst_stride;
2700         w = width;
2701
2702         while (w && (unsigned long)dst & 15)
2703         {
2704             d = *dst;
2705
2706             *dst++ = pack_565_32_16 (
2707                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2708                                          _mm_movepi64_pi64 (xmm_alpha),
2709                                          expand565_16_1x64 (d))));
2710             w--;
2711         }
2712
2713         while (w >= 8)
2714         {
2715             xmm_dst = load_128_aligned ((__m128i*)dst);
2716
2717             unpack_565_128_4x128 (xmm_dst,
2718                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2719
2720             over_2x128 (&xmm_src, &xmm_src,
2721                         &xmm_alpha, &xmm_alpha,
2722                         &xmm_dst0, &xmm_dst1);
2723             over_2x128 (&xmm_src, &xmm_src,
2724                         &xmm_alpha, &xmm_alpha,
2725                         &xmm_dst2, &xmm_dst3);
2726
2727             xmm_dst = pack_565_4x128_128 (
2728                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2729
2730             save_128_aligned ((__m128i*)dst, xmm_dst);
2731
2732             dst += 8;
2733             w -= 8;
2734         }
2735
2736         while (w--)
2737         {
2738             d = *dst;
2739             *dst++ = pack_565_32_16 (
2740                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2741                                          _mm_movepi64_pi64 (xmm_alpha),
2742                                          expand565_16_1x64 (d))));
2743         }
2744     }
2745
2746     _mm_empty ();
2747 }
2748
2749 /* ------------------------------
2750  * composite_add_n_8888_8888_ca
2751  */
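/* Component-alpha ADD with a solid source: dest = clamp (mask * src + dest).
 * The 4-pixel loop tests the loaded mask with _mm_cmpeq_epi32 and
 * _mm_movemask_epi8 and skips the read-modify-write when all four mask
 * pixels are zero.
 */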
2752 static void
2753 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2754                                    pixman_op_t              op,
2755                                    pixman_image_t *         src_image,
2756                                    pixman_image_t *         mask_image,
2757                                    pixman_image_t *         dst_image,
2758                                    int32_t                  src_x,
2759                                    int32_t                  src_y,
2760                                    int32_t                  mask_x,
2761                                    int32_t                  mask_y,
2762                                    int32_t                  dest_x,
2763                                    int32_t                  dest_y,
2764                                    int32_t                  width,
2765                                    int32_t                  height)
2766 {
2767     uint32_t src, srca;
2768     uint32_t    *dst_line, d;
2769     uint32_t    *mask_line, m;
2770     uint32_t pack_cmp;
2771     int dst_stride, mask_stride;
2772
2773     __m128i xmm_src, xmm_alpha;
2774     __m128i xmm_dst;
2775     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2776
2777     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2778
2779     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2780     srca = src >> 24;
2781
2782     if (src == 0)
2783         return;
2784
2785     PIXMAN_IMAGE_GET_LINE (
2786         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2787     PIXMAN_IMAGE_GET_LINE (
2788         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2789
2790     xmm_src = _mm_unpacklo_epi8 (
2791         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2792     xmm_alpha = expand_alpha_1x128 (xmm_src);
2793     mmx_src   = _mm_movepi64_pi64 (xmm_src);
2794     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2795
2796     while (height--)
2797     {
2798         int w = width;
2799         const uint32_t *pm = (uint32_t *)mask_line;
2800         uint32_t *pd = (uint32_t *)dst_line;
2801
2802         dst_line += dst_stride;
2803         mask_line += mask_stride;
2804
2805         while (w && (unsigned long)pd & 15)
2806         {
2807             m = *pm++;
2808
2809             if (m)
2810             {
2811                 d = *pd;
2812
2813                 mmx_mask = unpack_32_1x64 (m);
2814                 mmx_dest = unpack_32_1x64 (d);
2815
2816                 *pd = pack_1x64_32 (
2817                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2818             }
2819
2820             pd++;
2821             w--;
2822         }
2823
2824         while (w >= 4)
2825         {
2826             xmm_mask = load_128_unaligned ((__m128i*)pm);
2827
2828             pack_cmp =
2829                 _mm_movemask_epi8 (
2830                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2831
2832             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2833             if (pack_cmp != 0xffff)
2834             {
2835                 xmm_dst = load_128_aligned ((__m128i*)pd);
2836
2837                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2838
2839                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2840                                     &xmm_mask_lo, &xmm_mask_hi,
2841                                     &xmm_mask_lo, &xmm_mask_hi);
2842                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2843
2844                 save_128_aligned (
2845                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2846             }
2847
2848             pd += 4;
2849             pm += 4;
2850             w -= 4;
2851         }
2852
2853         while (w)
2854         {
2855             m = *pm++;
2856
2857             if (m)
2858             {
2859                 d = *pd;
2860
2861                 mmx_mask = unpack_32_1x64 (m);
2862                 mmx_dest = unpack_32_1x64 (d);
2863
2864                 *pd = pack_1x64_32 (
2865                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2866             }
2867
2868             pd++;
2869             w--;
2870         }
2871     }
2872
2873     _mm_empty ();
2874 }
2875
2876 /* ---------------------------------------------------------------------------
2877  * composite_over_n_8888_8888_ca
2878  */
2879
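/* Component-alpha OVER with a solid source, per channel (roughly):
 *
 *     dest = src * mask + dest * (1 - src.a * mask)
 *
 * i.e. in_over with the solid source and its alpha hoisted out of the
 * loops; zero masks are skipped with the same movemask test as the ADD
 * path above.
 */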
2880 static void
2881 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2882                                     pixman_op_t              op,
2883                                     pixman_image_t *         src_image,
2884                                     pixman_image_t *         mask_image,
2885                                     pixman_image_t *         dst_image,
2886                                     int32_t                  src_x,
2887                                     int32_t                  src_y,
2888                                     int32_t                  mask_x,
2889                                     int32_t                  mask_y,
2890                                     int32_t                  dest_x,
2891                                     int32_t                  dest_y,
2892                                     int32_t                  width,
2893                                     int32_t                  height)
2894 {
2895     uint32_t src;
2896     uint32_t    *dst_line, d;
2897     uint32_t    *mask_line, m;
2898     uint32_t pack_cmp;
2899     int dst_stride, mask_stride;
2900
2901     __m128i xmm_src, xmm_alpha;
2902     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2903     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2904
2905     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2906
2907     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2908
2909     if (src == 0)
2910         return;
2911
2912     PIXMAN_IMAGE_GET_LINE (
2913         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2914     PIXMAN_IMAGE_GET_LINE (
2915         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2916
2917     xmm_src = _mm_unpacklo_epi8 (
2918         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2919     xmm_alpha = expand_alpha_1x128 (xmm_src);
2920     mmx_src   = _mm_movepi64_pi64 (xmm_src);
2921     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2922
2923     while (height--)
2924     {
2925         int w = width;
2926         const uint32_t *pm = (uint32_t *)mask_line;
2927         uint32_t *pd = (uint32_t *)dst_line;
2928
2929         dst_line += dst_stride;
2930         mask_line += mask_stride;
2931
2932         while (w && (unsigned long)pd & 15)
2933         {
2934             m = *pm++;
2935
2936             if (m)
2937             {
2938                 d = *pd;
2939                 mmx_mask = unpack_32_1x64 (m);
2940                 mmx_dest = unpack_32_1x64 (d);
2941
2942                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
2943                                                   &mmx_alpha,
2944                                                   &mmx_mask,
2945                                                   &mmx_dest));
2946             }
2947
2948             pd++;
2949             w--;
2950         }
2951
2952         while (w >= 4)
2953         {
2954             xmm_mask = load_128_unaligned ((__m128i*)pm);
2955
2956             pack_cmp =
2957                 _mm_movemask_epi8 (
2958                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2959
2960             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2961             if (pack_cmp != 0xffff)
2962             {
2963                 xmm_dst = load_128_aligned ((__m128i*)pd);
2964
2965                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2966                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2967
2968                 in_over_2x128 (&xmm_src, &xmm_src,
2969                                &xmm_alpha, &xmm_alpha,
2970                                &xmm_mask_lo, &xmm_mask_hi,
2971                                &xmm_dst_lo, &xmm_dst_hi);
2972
2973                 save_128_aligned (
2974                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2975             }
2976
2977             pd += 4;
2978             pm += 4;
2979             w -= 4;
2980         }
2981
2982         while (w)
2983         {
2984             m = *pm++;
2985
2986             if (m)
2987             {
2988                 d = *pd;
2989                 mmx_mask = unpack_32_1x64 (m);
2990                 mmx_dest = unpack_32_1x64 (d);
2991
2992                 *pd = pack_1x64_32 (
2993                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2994             }
2995
2996             pd++;
2997             w--;
2998         }
2999     }
3000
3001     _mm_empty ();
3002 }
3003
3004 /*---------------------------------------------------------------------
3005  * composite_over_8888_n_8888
3006  */
3007
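/* OVER of an a8r8g8b8 source under a constant mask.  Only the mask's
 * alpha byte matters: it is broadcast with create_mask_16_128 (mask >> 24)
 * and passed to in_over_* as the mask term, giving (roughly)
 * dest = src * m + dest * (1 - src.a * m) per channel.
 */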
3008 static void
3009 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3010                                  pixman_op_t              op,
3011                                  pixman_image_t *         src_image,
3012                                  pixman_image_t *         mask_image,
3013                                  pixman_image_t *         dst_image,
3014                                  int32_t                  src_x,
3015                                  int32_t                  src_y,
3016                                  int32_t                  mask_x,
3017                                  int32_t                  mask_y,
3018                                  int32_t                  dest_x,
3019                                  int32_t                  dest_y,
3020                                  int32_t                  width,
3021                                  int32_t                  height)
3022 {
3023     uint32_t    *dst_line, *dst;
3024     uint32_t    *src_line, *src;
3025     uint32_t mask;
3026     int32_t w;
3027     int dst_stride, src_stride;
3028
3029     __m128i xmm_mask;
3030     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3031     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3032     __m128i xmm_alpha_lo, xmm_alpha_hi;
3033
3034     PIXMAN_IMAGE_GET_LINE (
3035         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3036     PIXMAN_IMAGE_GET_LINE (
3037         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3038
3039     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3040
3041     xmm_mask = create_mask_16_128 (mask >> 24);
3042
3043     while (height--)
3044     {
3045         dst = dst_line;
3046         dst_line += dst_stride;
3047         src = src_line;
3048         src_line += src_stride;
3049         w = width;
3050
3051         while (w && (unsigned long)dst & 15)
3052         {
3053             uint32_t s = *src++;
3054             uint32_t d = *dst;
3055
3056             __m64 ms = unpack_32_1x64 (s);
3057             __m64 alpha    = expand_alpha_1x64 (ms);
3058             __m64 dest     = _mm_movepi64_pi64 (xmm_mask);
3059             __m64 alpha_dst = unpack_32_1x64 (d);
3060
3061             *dst++ = pack_1x64_32 (
3062                 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
3063
3064             w--;
3065         }
3066
3067         while (w >= 4)
3068         {
3069             xmm_src = load_128_unaligned ((__m128i*)src);
3070             xmm_dst = load_128_aligned ((__m128i*)dst);
3071
3072             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3073             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3074             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3075                                 &xmm_alpha_lo, &xmm_alpha_hi);
3076
3077             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3078                            &xmm_alpha_lo, &xmm_alpha_hi,
3079                            &xmm_mask, &xmm_mask,
3080                            &xmm_dst_lo, &xmm_dst_hi);
3081
3082             save_128_aligned (
3083                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3084
3085             dst += 4;
3086             src += 4;
3087             w -= 4;
3088         }
3089
3090         while (w)
3091         {
3092             uint32_t s = *src++;
3093             uint32_t d = *dst;
3094
3095             __m64 ms = unpack_32_1x64 (s);
3096             __m64 alpha = expand_alpha_1x64 (ms);
3097             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3098             __m64 dest  = unpack_32_1x64 (d);
3099
3100             *dst++ = pack_1x64_32 (
3101                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3102
3103             w--;
3104         }
3105     }
3106
3107     _mm_empty ();
3108 }
3109
3110 /*---------------------------------------------------------------------
3111  * composite_src_x888_8888
3112  */
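/* SRC from x8r8g8b8 to a8r8g8b8 only needs to force the alpha byte to
 * 0xff, so each pixel is OR'ed with 0xff000000 (mask_ff000000 in the
 * 16-pixel vector loop).
 */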
3113
3114 static void
3115 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
3116                               pixman_op_t              op,
3117                               pixman_image_t *         src_image,
3118                               pixman_image_t *         mask_image,
3119                               pixman_image_t *         dst_image,
3120                               int32_t                  src_x,
3121                               int32_t                  src_y,
3122                               int32_t                  mask_x,
3123                               int32_t                  mask_y,
3124                               int32_t                  dest_x,
3125                               int32_t                  dest_y,
3126                               int32_t                  width,
3127                               int32_t                  height)
3128 {
3129     uint32_t    *dst_line, *dst;
3130     uint32_t    *src_line, *src;
3131     int32_t w;
3132     int dst_stride, src_stride;
3133
3134
3135     PIXMAN_IMAGE_GET_LINE (
3136         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3137     PIXMAN_IMAGE_GET_LINE (
3138         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3139
3140     while (height--)
3141     {
3142         dst = dst_line;
3143         dst_line += dst_stride;
3144         src = src_line;
3145         src_line += src_stride;
3146         w = width;
3147
3148         while (w && (unsigned long)dst & 15)
3149         {
3150             *dst++ = *src++ | 0xff000000;
3151             w--;
3152         }
3153
3154         while (w >= 16)
3155         {
3156             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
3157             
3158             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
3159             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
3160             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
3161             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
3162             
3163             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
3164             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
3165             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
3166             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
3167             
3168             dst += 16;
3169             src += 16;
3170             w -= 16;
3171         }
3172
3173         while (w)
3174         {
3175             *dst++ = *src++ | 0xff000000;
3176             w--;
3177         }
3178     }
3179
3180     _mm_empty ();
3181 }
3182
3183 /* ---------------------------------------------------------------------
3184  * composite_over_x888_n_8888
3185  */
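/* Like composite_over_8888_n_8888 above, but the source has no alpha
 * channel: every source pixel is OR'ed with 0xff000000 and the expanded
 * source alpha collapses to the constant mask_00ff.
 */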
3186 static void
3187 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3188                                  pixman_op_t              op,
3189                                  pixman_image_t *         src_image,
3190                                  pixman_image_t *         mask_image,
3191                                  pixman_image_t *         dst_image,
3192                                  int32_t                  src_x,
3193                                  int32_t                  src_y,
3194                                  int32_t                  mask_x,
3195                                  int32_t                  mask_y,
3196                                  int32_t                  dest_x,
3197                                  int32_t                  dest_y,
3198                                  int32_t                  width,
3199                                  int32_t                  height)
3200 {
3201     uint32_t    *dst_line, *dst;
3202     uint32_t    *src_line, *src;
3203     uint32_t mask;
3204     int dst_stride, src_stride;
3205     int32_t w;
3206
3207     __m128i xmm_mask, xmm_alpha;
3208     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3209     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3210
3211     PIXMAN_IMAGE_GET_LINE (
3212         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3213     PIXMAN_IMAGE_GET_LINE (
3214         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3215
3216     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3217
3218     xmm_mask = create_mask_16_128 (mask >> 24);
3219     xmm_alpha = mask_00ff;
3220
3221     while (height--)
3222     {
3223         dst = dst_line;
3224         dst_line += dst_stride;
3225         src = src_line;
3226         src_line += src_stride;
3227         w = width;
3228
3229         while (w && (unsigned long)dst & 15)
3230         {
3231             uint32_t s = (*src++) | 0xff000000;
3232             uint32_t d = *dst;
3233
3234             __m64 src   = unpack_32_1x64 (s);
3235             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3236             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3237             __m64 dest  = unpack_32_1x64 (d);
3238
3239             *dst++ = pack_1x64_32 (
3240                 in_over_1x64 (&src, &alpha, &mask, &dest));
3241
3242             w--;
3243         }
3244
3245         while (w >= 4)
3246         {
3247             xmm_src = _mm_or_si128 (
3248                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3249             xmm_dst = load_128_aligned ((__m128i*)dst);
3250
3251             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3252             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3253
3254             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3255                            &xmm_alpha, &xmm_alpha,
3256                            &xmm_mask, &xmm_mask,
3257                            &xmm_dst_lo, &xmm_dst_hi);
3258
3259             save_128_aligned (
3260                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3261
3262             dst += 4;
3263             src += 4;
3264             w -= 4;
3265
3266         }
3267
3268         while (w)
3269         {
3270             uint32_t s = (*src++) | 0xff000000;
3271             uint32_t d = *dst;
3272
3273             __m64 src  = unpack_32_1x64 (s);
3274             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3275             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3276             __m64 dest  = unpack_32_1x64 (d);
3277
3278             *dst++ = pack_1x64_32 (
3279                 in_over_1x64 (&src, &alpha, &mask, &dest));
3280
3281             w--;
3282         }
3283     }
3284
3285     _mm_empty ();
3286 }
3287
3288 /* --------------------------------------------------------------------
3289  * composite_over_8888_8888
3290  */
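/* The unmasked 8888 OVER simply hands each scanline to
 * core_combine_over_u_sse2 with a NULL mask.
 */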
3291 static void
3292 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3293                                pixman_op_t              op,
3294                                pixman_image_t *         src_image,
3295                                pixman_image_t *         mask_image,
3296                                pixman_image_t *         dst_image,
3297                                int32_t                  src_x,
3298                                int32_t                  src_y,
3299                                int32_t                  mask_x,
3300                                int32_t                  mask_y,
3301                                int32_t                  dest_x,
3302                                int32_t                  dest_y,
3303                                int32_t                  width,
3304                                int32_t                  height)
3305 {
3306     int dst_stride, src_stride;
3307     uint32_t    *dst_line, *dst;
3308     uint32_t    *src_line, *src;
3309
3310     PIXMAN_IMAGE_GET_LINE (
3311         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3312     PIXMAN_IMAGE_GET_LINE (
3313         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3314
3315     dst = dst_line;
3316     src = src_line;
3317
3318     while (height--)
3319     {
3320         core_combine_over_u_sse2 (dst, src, NULL, width);
3321
3322         dst += dst_stride;
3323         src += src_stride;
3324     }
3325     _mm_empty ();
3326 }
3327
3328 /* ------------------------------------------------------------------
3329  * composite_over_8888_0565
3330  */
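/* OVER onto r5g6b5: the helper below blends one pixel by expanding the
 * 565 destination to 8888, applying OVER, and repacking; the vector loop
 * does the same for 8 destination pixels per aligned load using two
 * over_2x128 passes.
 */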
3331 static force_inline uint16_t
3332 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3333 {
3334     __m64 ms;
3335
3336     ms = unpack_32_1x64 (src);
3337     return pack_565_32_16 (
3338         pack_1x64_32 (
3339             over_1x64 (
3340                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3341 }
3342
3343 static void
3344 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3345                                pixman_op_t              op,
3346                                pixman_image_t *         src_image,
3347                                pixman_image_t *         mask_image,
3348                                pixman_image_t *         dst_image,
3349                                int32_t                  src_x,
3350                                int32_t                  src_y,
3351                                int32_t                  mask_x,
3352                                int32_t                  mask_y,
3353                                int32_t                  dest_x,
3354                                int32_t                  dest_y,
3355                                int32_t                  width,
3356                                int32_t                  height)
3357 {
3358     uint16_t    *dst_line, *dst, d;
3359     uint32_t    *src_line, *src, s;
3360     int dst_stride, src_stride;
3361     int32_t w;
3362
3363     __m128i xmm_alpha_lo, xmm_alpha_hi;
3364     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3365     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3366
3367     PIXMAN_IMAGE_GET_LINE (
3368         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3369     PIXMAN_IMAGE_GET_LINE (
3370         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3371
3372 #if 0
3373     /* FIXME
3374      *
3375      * This is copied from the MMX version, FIXME included.
3376      * If it's a problem there, it's probably a problem here too.
3377      */
3378     assert (src_image->drawable == mask_image->drawable);
3379 #endif
3380
3381     while (height--)
3382     {
3383         dst = dst_line;
3384         src = src_line;
3385
3386         dst_line += dst_stride;
3387         src_line += src_stride;
3388         w = width;
3389
3390         /* Align dst on a 16-byte boundary */
3391         while (w &&
3392                ((unsigned long)dst & 15))
3393         {
3394             s = *src++;
3395             d = *dst;
3396
3397             *dst++ = composite_over_8888_0565pixel (s, d);
3398             w--;
3399         }
3400
3401         /* It's an 8-pixel loop */
3402         while (w >= 8)
3403         {
3404             /* Load the source unaligned, since its address
3405              * is not guaranteed to be 16-byte aligned.
3406              */
3407             xmm_src = load_128_unaligned ((__m128i*) src);
3408             xmm_dst = load_128_aligned ((__m128i*) dst);
3409
3410             /* Unpacking */
3411             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3412             unpack_565_128_4x128 (xmm_dst,
3413                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3414             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3415                                 &xmm_alpha_lo, &xmm_alpha_hi);
3416
3417             /* Load the next 4 pixels from memory ahead of
3418              * time to optimize the memory read.
3419              */
3420             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3421
3422             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3423                         &xmm_alpha_lo, &xmm_alpha_hi,
3424                         &xmm_dst0, &xmm_dst1);
3425
3426             /* Unpacking */
3427             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3428             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3429                                 &xmm_alpha_lo, &xmm_alpha_hi);
3430
3431             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3432                         &xmm_alpha_lo, &xmm_alpha_hi,
3433                         &xmm_dst2, &xmm_dst3);
3434
3435             save_128_aligned (
3436                 (__m128i*)dst, pack_565_4x128_128 (
3437                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3438
3439             w -= 8;
3440             dst += 8;
3441             src += 8;
3442         }
3443
3444         while (w--)
3445         {
3446             s = *src++;
3447             d = *dst;
3448
3449             *dst++ = composite_over_8888_0565pixel (s, d);
3450         }
3451     }
3452
3453     _mm_empty ();
3454 }
3455
3456 /* -----------------------------------------------------------------
3457  * composite_over_n_8_8888
3458  */
3459
3460 static void
3461 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3462                               pixman_op_t              op,
3463                               pixman_image_t *         src_image,
3464                               pixman_image_t *         mask_image,
3465                               pixman_image_t *         dst_image,
3466                               int32_t                  src_x,
3467                               int32_t                  src_y,
3468                               int32_t                  mask_x,
3469                               int32_t                  mask_y,
3470                               int32_t                  dest_x,
3471                               int32_t                  dest_y,
3472                               int32_t                  width,
3473                               int32_t                  height)
3474 {
3475     uint32_t src, srca;
3476     uint32_t *dst_line, *dst;
3477     uint8_t *mask_line, *mask;
3478     int dst_stride, mask_stride;
3479     int32_t w;
3480     uint32_t m, d;
3481
3482     __m128i xmm_src, xmm_alpha, xmm_def;
3483     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3484     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3485
3486     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3487
3488     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3489
3490     srca = src >> 24;
3491     if (src == 0)
3492         return;
3493
3494     PIXMAN_IMAGE_GET_LINE (
3495         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3496     PIXMAN_IMAGE_GET_LINE (
3497         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3498
3499     xmm_def = create_mask_2x32_128 (src, src);
3500     xmm_src = expand_pixel_32_1x128 (src);
3501     xmm_alpha = expand_alpha_1x128 (xmm_src);
3502     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3503     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3504
3505     while (height--)
3506     {
3507         dst = dst_line;
3508         dst_line += dst_stride;
3509         mask = mask_line;
3510         mask_line += mask_stride;
3511         w = width;
3512
3513         while (w && (unsigned long)dst & 15)
3514         {
3515             uint8_t m = *mask++;
3516
3517             if (m)
3518             {
3519                 d = *dst;
3520                 mmx_mask = expand_pixel_8_1x64 (m);
3521                 mmx_dest = unpack_32_1x64 (d);
3522
3523                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3524                                                    &mmx_alpha,
3525                                                    &mmx_mask,
3526                                                    &mmx_dest));
3527             }
3528
3529             w--;
3530             dst++;
3531         }
3532
3533         while (w >= 4)
3534         {
3535             m = *((uint32_t*)mask);
3536
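            /* With an opaque source and all four mask bytes 0xff,
             * OVER reduces to storing the solid source color
             * directly.
             */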
3537             if (srca == 0xff && m == 0xffffffff)
3538             {
3539                 save_128_aligned ((__m128i*)dst, xmm_def);
3540             }
3541             else if (m)
3542             {
3543                 xmm_dst = load_128_aligned ((__m128i*) dst);
3544                 xmm_mask = unpack_32_1x128 (m);
3545                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3546
3547                 /* Unpacking */
3548                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3549                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3550
3551                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3552                                         &xmm_mask_lo, &xmm_mask_hi);
3553
3554                 in_over_2x128 (&xmm_src, &xmm_src,
3555                                &xmm_alpha, &xmm_alpha,
3556                                &xmm_mask_lo, &xmm_mask_hi,
3557                                &xmm_dst_lo, &xmm_dst_hi);
3558
3559                 save_128_aligned (
3560                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3561             }
3562
3563             w -= 4;
3564             dst += 4;
3565             mask += 4;
3566         }
3567
3568         while (w)
3569         {
3570             uint8_t m = *mask++;
3571
3572             if (m)
3573             {
3574                 d = *dst;
3575                 mmx_mask = expand_pixel_8_1x64 (m);
3576                 mmx_dest = unpack_32_1x64 (d);
3577
3578                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3579                                                    &mmx_alpha,
3580                                                    &mmx_mask,
3581                                                    &mmx_dest));
3582             }
3583
3584             w--;
3585             dst++;
3586         }
3587     }
3588
3589     _mm_empty ();
3590 }
3591
3592 /* ----------------------------------------------------------------
3593  * pixman_fill_sse2
3594  */
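/* The fill value is first replicated up to 32 bits (for bpp == 8,
 * 0x5A becomes 0x5A5A5A5A; for bpp == 16 the low half-word is
 * doubled), then broadcast into a 128-bit register so that the bulk
 * of each scanline can be written with aligned 16-byte stores;
 * byte/word/dword stores cover the unaligned head and the tail.
 */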
3595
3596 pixman_bool_t
3597 pixman_fill_sse2 (uint32_t *bits,
3598                   int       stride,
3599                   int       bpp,
3600                   int       x,
3601                   int       y,
3602                   int       width,
3603                   int       height,
3604                   uint32_t  data)
3605 {
3606     uint32_t byte_width;
3607     uint8_t         *byte_line;
3608
3609     __m128i xmm_def;
3610
3611     if (bpp == 8)
3612     {
3613         uint8_t b;
3614         uint16_t w;
3615
3616         stride = stride * (int) sizeof (uint32_t) / 1;
3617         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3618         byte_width = width;
3619         stride *= 1;
3620
3621         b = data & 0xff;
3622         w = (b << 8) | b;
3623         data = (w << 16) | w;
3624     }
3625     else if (bpp == 16)
3626     {
3627         stride = stride * (int) sizeof (uint32_t) / 2;
3628         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3629         byte_width = 2 * width;
3630         stride *= 2;
3631
3632         data = (data & 0xffff) * 0x00010001;
3633     }
3634     else if (bpp == 32)
3635     {
3636         stride = stride * (int) sizeof (uint32_t) / 4;
3637         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3638         byte_width = 4 * width;
3639         stride *= 4;
3640     }
3641     else
3642     {
3643         return FALSE;
3644     }
3645
3646     xmm_def = create_mask_2x32_128 (data, data);
3647
3648     while (height--)
3649     {
3650         int w;
3651         uint8_t *d = byte_line;
3652         byte_line += stride;
3653         w = byte_width;
3654
3655         while (w >= 1 && ((unsigned long)d & 1))
3656         {
3657             *(uint8_t *)d = data;
3658             w -= 1;
3659             d += 1;
3660         }
3661
3662         while (w >= 2 && ((unsigned long)d & 3))
3663         {
3664             *(uint16_t *)d = data;
3665             w -= 2;
3666             d += 2;
3667         }
3668
3669         while (w >= 4 && ((unsigned long)d & 15))
3670         {
3671             *(uint32_t *)d = data;
3672
3673             w -= 4;
3674             d += 4;
3675         }
3676
3677         while (w >= 128)
3678         {
3679             save_128_aligned ((__m128i*)(d),     xmm_def);
3680             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3681             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3682             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3683             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3684             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3685             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3686             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3687
3688             d += 128;
3689             w -= 128;
3690         }
3691
3692         if (w >= 64)
3693         {
3694             save_128_aligned ((__m128i*)(d),     xmm_def);
3695             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3696             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3697             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3698
3699             d += 64;
3700             w -= 64;
3701         }
3702
3703         if (w >= 32)
3704         {
3705             save_128_aligned ((__m128i*)(d),     xmm_def);
3706             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3707
3708             d += 32;
3709             w -= 32;
3710         }
3711
3712         if (w >= 16)
3713         {
3714             save_128_aligned ((__m128i*)(d),     xmm_def);
3715
3716             d += 16;
3717             w -= 16;
3718         }
3719
3720         while (w >= 4)
3721         {
3722             *(uint32_t *)d = data;
3723
3724             w -= 4;
3725             d += 4;
3726         }
3727
3728         if (w >= 2)
3729         {
3730             *(uint16_t *)d = data;
3731             w -= 2;
3732             d += 2;
3733         }
3734
3735         if (w >= 1)
3736         {
3737             *(uint8_t *)d = data;
3738             w -= 1;
3739             d += 1;
3740         }
3741     }
3742
3743     _mm_empty ();
3744     return TRUE;
3745 }
3746
3747 static void
3748 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3749                              pixman_op_t              op,
3750                              pixman_image_t *         src_image,
3751                              pixman_image_t *         mask_image,
3752                              pixman_image_t *         dst_image,
3753                              int32_t                  src_x,
3754                              int32_t                  src_y,
3755                              int32_t                  mask_x,
3756                              int32_t                  mask_y,
3757                              int32_t                  dest_x,
3758                              int32_t                  dest_y,
3759                              int32_t                  width,
3760                              int32_t                  height)
3761 {
3762     uint32_t src, srca;
3763     uint32_t    *dst_line, *dst;
3764     uint8_t     *mask_line, *mask;
3765     int dst_stride, mask_stride;
3766     int32_t w;
3767     uint32_t m;
3768
3769     __m128i xmm_src, xmm_def;
3770     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3771
3772     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3773
3774     srca = src >> 24;
3775     if (src == 0)
3776     {
3777         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3778                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
3779                           dest_x, dest_y, width, height, 0);
3780         return;
3781     }
3782
3783     PIXMAN_IMAGE_GET_LINE (
3784         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3785     PIXMAN_IMAGE_GET_LINE (
3786         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3787
3788     xmm_def = create_mask_2x32_128 (src, src);
3789     xmm_src = expand_pixel_32_1x128 (src);
3790
3791     while (height--)
3792     {
3793         dst = dst_line;
3794         dst_line += dst_stride;
3795         mask = mask_line;
3796         mask_line += mask_stride;
3797         w = width;
3798
3799         while (w && (unsigned long)dst & 15)
3800         {
3801             uint8_t m = *mask++;
3802
3803             if (m)
3804             {
3805                 *dst = pack_1x64_32 (
3806                     pix_multiply_1x64 (
3807                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3808             }
3809             else
3810             {
3811                 *dst = 0;
3812             }
3813
3814             w--;
3815             dst++;
3816         }
3817
3818         while (w >= 4)
3819         {
3820             m = *((uint32_t*)mask);
3821
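            /* Per group of four pixels there are three cases: an
             * opaque source with a fully set mask is a plain store
             * of the solid color, a zero mask stores zero (SRC
             * semantics), and anything in between multiplies the
             * solid source by the mask.
             */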
3822             if (srca == 0xff && m == 0xffffffff)
3823             {
3824                 save_128_aligned ((__m128i*)dst, xmm_def);
3825             }
3826             else if (m)
3827             {
3828                 xmm_mask = unpack_32_1x128 (m);
3829                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3830
3831                 /* Unpacking */
3832                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3833
3834                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3835                                         &xmm_mask_lo, &xmm_mask_hi);
3836
3837                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3838                                     &xmm_mask_lo, &xmm_mask_hi,
3839                                     &xmm_mask_lo, &xmm_mask_hi);
3840
3841                 save_128_aligned (
3842                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3843             }
3844             else
3845             {
3846                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3847             }
3848
3849             w -= 4;
3850             dst += 4;
3851             mask += 4;
3852         }
3853
3854         while (w)
3855         {
3856             uint8_t m = *mask++;
3857
3858             if (m)
3859             {
3860                 *dst = pack_1x64_32 (
3861                     pix_multiply_1x64 (
3862                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3863             }
3864             else
3865             {
3866                 *dst = 0;
3867             }
3868
3869             w--;
3870             dst++;
3871         }
3872     }
3873
3874     _mm_empty ();
3875 }
3876
3877 /*-----------------------------------------------------------------------
3878  * composite_over_n_8_0565
3879  */
3880
3881 static void
3882 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3883                               pixman_op_t              op,
3884                               pixman_image_t *         src_image,
3885                               pixman_image_t *         mask_image,
3886                               pixman_image_t *         dst_image,
3887                               int32_t                  src_x,
3888                               int32_t                  src_y,
3889                               int32_t                  mask_x,
3890                               int32_t                  mask_y,
3891                               int32_t                  dest_x,
3892                               int32_t                  dest_y,
3893                               int32_t                  width,
3894                               int32_t                  height)
3895 {
3896     uint32_t src, srca;
3897     uint16_t    *dst_line, *dst, d;
3898     uint8_t     *mask_line, *mask;
3899     int dst_stride, mask_stride;
3900     int32_t w;
3901     uint32_t m;
3902     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3903
3904     __m128i xmm_src, xmm_alpha;
3905     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3906     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3907
3908     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3909
3910     srca = src >> 24;
3911     if (src == 0)
3912         return;
3913
3914     PIXMAN_IMAGE_GET_LINE (
3915         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3916     PIXMAN_IMAGE_GET_LINE (
3917         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3918
3919     xmm_src = expand_pixel_32_1x128 (src);
3920     xmm_alpha = expand_alpha_1x128 (xmm_src);
3921     mmx_src = _mm_movepi64_pi64 (xmm_src);
3922     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3923
3924     while (height--)
3925     {
3926         dst = dst_line;
3927         dst_line += dst_stride;
3928         mask = mask_line;
3929         mask_line += mask_stride;
3930         w = width;
3931
3932         while (w && (unsigned long)dst & 15)
3933         {
3934             m = *mask++;
3935
3936             if (m)
3937             {
3938                 d = *dst;
3939                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
3940                 mmx_dest = expand565_16_1x64 (d);
3941
3942                 *dst = pack_565_32_16 (
3943                     pack_1x64_32 (
3944                         in_over_1x64 (
3945                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3946             }
3947
3948             w--;
3949             dst++;
3950         }
3951
3952         while (w >= 8)
3953         {
3954             xmm_dst = load_128_aligned ((__m128i*) dst);
3955             unpack_565_128_4x128 (xmm_dst,
3956                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3957
3958             m = *((uint32_t*)mask);
3959             mask += 4;
3960
3961             if (m)
3962             {
3963                 xmm_mask = unpack_32_1x128 (m);
3964                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3965
3966                 /* Unpacking */
3967                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3968
3969                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3970                                         &xmm_mask_lo, &xmm_mask_hi);
3971
3972                 in_over_2x128 (&xmm_src, &xmm_src,
3973                                &xmm_alpha, &xmm_alpha,
3974                                &xmm_mask_lo, &xmm_mask_hi,
3975                                &xmm_dst0, &xmm_dst1);
3976             }
3977
3978             m = *((uint32_t*)mask);
3979             mask += 4;
3980
3981             if (m)
3982             {
3983                 xmm_mask = unpack_32_1x128 (m);
3984                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3985
3986                 /* Unpacking */
3987                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3988
3989                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3990                                         &xmm_mask_lo, &xmm_mask_hi);
3991                 in_over_2x128 (&xmm_src, &xmm_src,
3992                                &xmm_alpha, &xmm_alpha,
3993                                &xmm_mask_lo, &xmm_mask_hi,
3994                                &xmm_dst2, &xmm_dst3);
3995             }
3996
3997             save_128_aligned (
3998                 (__m128i*)dst, pack_565_4x128_128 (
3999                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4000
4001             w -= 8;
4002             dst += 8;
4003         }
4004
4005         while (w)
4006         {
4007             m = *mask++;
4008
4009             if (m)
4010             {
4011                 d = *dst;
4012                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4013                 mmx_dest = expand565_16_1x64 (d);
4014
4015                 *dst = pack_565_32_16 (
4016                     pack_1x64_32 (
4017                         in_over_1x64 (
4018                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4019             }
4020
4021             w--;
4022             dst++;
4023         }
4024     }
4025
4026     _mm_empty ();
4027 }
4028
4029 /* -----------------------------------------------------------------------
4030  * composite_over_pixbuf_0565
4031  */
4032
4033 static void
4034 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4035                                  pixman_op_t              op,
4036                                  pixman_image_t *         src_image,
4037                                  pixman_image_t *         mask_image,
4038                                  pixman_image_t *         dst_image,
4039                                  int32_t                  src_x,
4040                                  int32_t                  src_y,
4041                                  int32_t                  mask_x,
4042                                  int32_t                  mask_y,
4043                                  int32_t                  dest_x,
4044                                  int32_t                  dest_y,
4045                                  int32_t                  width,
4046                                  int32_t                  height)
4047 {
4048     uint16_t    *dst_line, *dst, d;
4049     uint32_t    *src_line, *src, s;
4050     int dst_stride, src_stride;
4051     int32_t w;
4052     uint32_t opaque, zero;
4053
4054     __m64 ms;
4055     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4056     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4057
4058     PIXMAN_IMAGE_GET_LINE (
4059         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4060     PIXMAN_IMAGE_GET_LINE (
4061         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4062
4063 #if 0
4064     /* FIXME
4065      *
4066      * This code was copied from the MMX version, FIXME included.
4067      * If it's a problem there, it's probably a problem here too.
4068      */
4069     assert (src_image->drawable == mask_image->drawable);
4070 #endif
4071
4072     while (height--)
4073     {
4074         dst = dst_line;
4075         dst_line += dst_stride;
4076         src = src_line;
4077         src_line += src_stride;
4078         w = width;
4079
4080         while (w && (unsigned long)dst & 15)
4081         {
4082             s = *src++;
4083             d = *dst;
4084
4085             ms = unpack_32_1x64 (s);
4086
4087             *dst++ = pack_565_32_16 (
4088                 pack_1x64_32 (
4089                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4090             w--;
4091         }
4092
4093         while (w >= 8)
4094         {
4095             /* First round */
4096             xmm_src = load_128_unaligned ((__m128i*)src);
4097             xmm_dst = load_128_aligned  ((__m128i*)dst);
4098
4099             opaque = is_opaque (xmm_src);
4100             zero = is_zero (xmm_src);
4101
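            /* If all four source pixels are opaque the source simply
             * replaces the destination (only its red and blue
             * channels need swapping); if they are all transparent
             * the destination pixels are kept unchanged.
             */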
4102             unpack_565_128_4x128 (xmm_dst,
4103                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4104             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4105
4106             /* preload next round */
4107             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4108
4109             if (opaque)
4110             {
4111                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4112                                      &xmm_dst0, &xmm_dst1);
4113             }
4114             else if (!zero)
4115             {
4116                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4117                                         &xmm_dst0, &xmm_dst1);
4118             }
4119
4120             /* Second round */
4121             opaque = is_opaque (xmm_src);
4122             zero = is_zero (xmm_src);
4123
4124             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4125
4126             if (opaque)
4127             {
4128                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4129                                      &xmm_dst2, &xmm_dst3);
4130             }
4131             else if (!zero)
4132             {
4133                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4134                                         &xmm_dst2, &xmm_dst3);
4135             }
4136
4137             save_128_aligned (
4138                 (__m128i*)dst, pack_565_4x128_128 (
4139                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4140
4141             w -= 8;
4142             src += 8;
4143             dst += 8;
4144         }
4145
4146         while (w)
4147         {
4148             s = *src++;
4149             d = *dst;
4150
4151             ms = unpack_32_1x64 (s);
4152
4153             *dst++ = pack_565_32_16 (
4154                 pack_1x64_32 (
4155                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4156             w--;
4157         }
4158     }
4159
4160     _mm_empty ();
4161 }
4162
4163 /* -------------------------------------------------------------------------
4164  * composite_over_pixbuf_8888
4165  */
4166
4167 static void
4168 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4169                                  pixman_op_t              op,
4170                                  pixman_image_t *         src_image,
4171                                  pixman_image_t *         mask_image,
4172                                  pixman_image_t *         dst_image,
4173                                  int32_t                  src_x,
4174                                  int32_t                  src_y,
4175                                  int32_t                  mask_x,
4176                                  int32_t                  mask_y,
4177                                  int32_t                  dest_x,
4178                                  int32_t                  dest_y,
4179                                  int32_t                  width,
4180                                  int32_t                  height)
4181 {
4182     uint32_t    *dst_line, *dst, d;
4183     uint32_t    *src_line, *src, s;
4184     int dst_stride, src_stride;
4185     int32_t w;
4186     uint32_t opaque, zero;
4187
4188     __m128i xmm_src_lo, xmm_src_hi;
4189     __m128i xmm_dst_lo, xmm_dst_hi;
4190
4191     PIXMAN_IMAGE_GET_LINE (
4192         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4193     PIXMAN_IMAGE_GET_LINE (
4194         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4195
4196 #if 0
4197     /* FIXME
4198      *
4199      * This code was copied from the MMX version, FIXME included.
4200      * If it's a problem there, it's probably a problem here too.
4201      */
4202     assert (src_image->drawable == mask_image->drawable);
4203 #endif
4204
4205     while (height--)
4206     {
4207         dst = dst_line;
4208         dst_line += dst_stride;
4209         src = src_line;
4210         src_line += src_stride;
4211         w = width;
4212
4213         while (w && (unsigned long)dst & 15)
4214         {
4215             s = *src++;
4216             d = *dst;
4217
4218             *dst++ = pack_1x64_32 (
4219                 over_rev_non_pre_1x64 (
4220                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4221
4222             w--;
4223         }
4224
4225         while (w >= 4)
4226         {
4227             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4228
4229             opaque = is_opaque (xmm_src_hi);
4230             zero = is_zero (xmm_src_hi);
4231
4232             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4233
4234             if (opaque)
4235             {
4236                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4237                                      &xmm_dst_lo, &xmm_dst_hi);
4238
4239                 save_128_aligned (
4240                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4241             }
4242             else if (!zero)
4243             {
4244                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4245
4246                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4247
4248                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4249                                         &xmm_dst_lo, &xmm_dst_hi);
4250
4251                 save_128_aligned (
4252                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4253             }
4254
4255             w -= 4;
4256             dst += 4;
4257             src += 4;
4258         }
4259
4260         while (w)
4261         {
4262             s = *src++;
4263             d = *dst;
4264
4265             *dst++ = pack_1x64_32 (
4266                 over_rev_non_pre_1x64 (
4267                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4268
4269             w--;
4270         }
4271     }
4272
4273     _mm_empty ();
4274 }
4275
4276 /* -------------------------------------------------------------------------------------------------
4277  * composite_over_n_8888_0565_ca
4278  */
4279
4280 static void
4281 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4282                                     pixman_op_t              op,
4283                                     pixman_image_t *         src_image,
4284                                     pixman_image_t *         mask_image,
4285                                     pixman_image_t *         dst_image,
4286                                     int32_t                  src_x,
4287                                     int32_t                  src_y,
4288                                     int32_t                  mask_x,
4289                                     int32_t                  mask_y,
4290                                     int32_t                  dest_x,
4291                                     int32_t                  dest_y,
4292                                     int32_t                  width,
4293                                     int32_t                  height)
4294 {
4295     uint32_t src;
4296     uint16_t    *dst_line, *dst, d;
4297     uint32_t    *mask_line, *mask, m;
4298     int dst_stride, mask_stride;
4299     int w;
4300     uint32_t pack_cmp;
4301
4302     __m128i xmm_src, xmm_alpha;
4303     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4304     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4305
4306     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4307
4308     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4309
4310     if (src == 0)
4311         return;
4312
4313     PIXMAN_IMAGE_GET_LINE (
4314         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4315     PIXMAN_IMAGE_GET_LINE (
4316         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4317
4318     xmm_src = expand_pixel_32_1x128 (src);
4319     xmm_alpha = expand_alpha_1x128 (xmm_src);
4320     mmx_src = _mm_movepi64_pi64 (xmm_src);
4321     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4322
4323     while (height--)
4324     {
4325         w = width;
4326         mask = mask_line;
4327         dst = dst_line;
4328         mask_line += mask_stride;
4329         dst_line += dst_stride;
4330
4331         while (w && ((unsigned long)dst & 15))
4332         {
4333             m = *(uint32_t *) mask;
4334
4335             if (m)
4336             {
4337                 d = *dst;
4338                 mmx_mask = unpack_32_1x64 (m);
4339                 mmx_dest = expand565_16_1x64 (d);
4340
4341                 *dst = pack_565_32_16 (
4342                     pack_1x64_32 (
4343                         in_over_1x64 (
4344                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4345             }
4346
4347             w--;
4348             dst++;
4349             mask++;
4350         }
4351
4352         while (w >= 8)
4353         {
4354             /* First round */
4355             xmm_mask = load_128_unaligned ((__m128i*)mask);
4356             xmm_dst = load_128_aligned ((__m128i*)dst);
4357
4358             pack_cmp = _mm_movemask_epi8 (
4359                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
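            /* pack_cmp is 0xffff only when all four mask pixels are
             * zero; in that case the in_over below is skipped and
             * the destination is left untouched.
             */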
4360
4361             unpack_565_128_4x128 (xmm_dst,
4362                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4363             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4364
4365             /* preload next round */
4366             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4367
4369             if (pack_cmp != 0xffff)
4370             {
4371                 in_over_2x128 (&xmm_src, &xmm_src,
4372                                &xmm_alpha, &xmm_alpha,
4373                                &xmm_mask_lo, &xmm_mask_hi,
4374                                &xmm_dst0, &xmm_dst1);
4375             }
4376
4377             /* Second round */
4378             pack_cmp = _mm_movemask_epi8 (
4379                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4380
4381             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4382
4383             if (pack_cmp != 0xffff)
4384             {
4385                 in_over_2x128 (&xmm_src, &xmm_src,
4386                                &xmm_alpha, &xmm_alpha,
4387                                &xmm_mask_lo, &xmm_mask_hi,
4388                                &xmm_dst2, &xmm_dst3);
4389             }
4390
4391             save_128_aligned (
4392                 (__m128i*)dst, pack_565_4x128_128 (
4393                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4394
4395             w -= 8;
4396             dst += 8;
4397             mask += 8;
4398         }
4399
4400         while (w)
4401         {
4402             m = *(uint32_t *) mask;
4403
4404             if (m)
4405             {
4406                 d = *dst;
4407                 mmx_mask = unpack_32_1x64 (m);
4408                 mmx_dest = expand565_16_1x64 (d);
4409
4410                 *dst = pack_565_32_16 (
4411                     pack_1x64_32 (
4412                         in_over_1x64 (
4413                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4414             }
4415
4416             w--;
4417             dst++;
4418             mask++;
4419         }
4420     }
4421
4422     _mm_empty ();
4423 }
4424
4425 /* -----------------------------------------------------------------------
4426  * composite_in_n_8_8
4427  */
4428
4429 static void
4430 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4431                          pixman_op_t              op,
4432                          pixman_image_t *         src_image,
4433                          pixman_image_t *         mask_image,
4434                          pixman_image_t *         dst_image,
4435                          int32_t                  src_x,
4436                          int32_t                  src_y,
4437                          int32_t                  mask_x,
4438                          int32_t                  mask_y,
4439                          int32_t                  dest_x,
4440                          int32_t                  dest_y,
4441                          int32_t                  width,
4442                          int32_t                  height)
4443 {
4444     uint8_t     *dst_line, *dst;
4445     uint8_t     *mask_line, *mask;
4446     int dst_stride, mask_stride;
4447     uint32_t d, m;
4448     uint32_t src;
4449     uint8_t sa;
4450     int32_t w;
4451
4452     __m128i xmm_alpha;
4453     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4454     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4455
4456     PIXMAN_IMAGE_GET_LINE (
4457         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4458     PIXMAN_IMAGE_GET_LINE (
4459         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4460
4461     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4462
4463     sa = src >> 24;
4464
4465     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4466
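    /* For IN with an a8 mask the result is
     * dst = src_alpha * mask * dst, so only the expanded alpha of
     * the solid source is needed.
     */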
4467     while (height--)
4468     {
4469         dst = dst_line;
4470         dst_line += dst_stride;
4471         mask = mask_line;
4472         mask_line += mask_stride;
4473         w = width;
4474
4475         while (w && ((unsigned long)dst & 15))
4476         {
4477             m = (uint32_t) *mask++;
4478             d = (uint32_t) *dst;
4479
4480             *dst++ = (uint8_t) pack_1x64_32 (
4481                 pix_multiply_1x64 (
4482                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4483                                        unpack_32_1x64 (m)),
4484                     unpack_32_1x64 (d)));
4485             w--;
4486         }
4487
4488         while (w >= 16)
4489         {
4490             xmm_mask = load_128_unaligned ((__m128i*)mask);
4491             xmm_dst = load_128_aligned ((__m128i*)dst);
4492
4493             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4494             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4495
4496             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4497                                 &xmm_mask_lo, &xmm_mask_hi,
4498                                 &xmm_mask_lo, &xmm_mask_hi);
4499
4500             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4501                                 &xmm_dst_lo, &xmm_dst_hi,
4502                                 &xmm_dst_lo, &xmm_dst_hi);
4503
4504             save_128_aligned (
4505                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4506
4507             mask += 16;
4508             dst += 16;
4509             w -= 16;
4510         }
4511
4512         while (w)
4513         {
4514             m = (uint32_t) *mask++;
4515             d = (uint32_t) *dst;
4516
4517             *dst++ = (uint8_t) pack_1x64_32 (
4518                 pix_multiply_1x64 (
4519                     pix_multiply_1x64 (
4520                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4521                     unpack_32_1x64 (d)));
4522             w--;
4523         }
4524     }
4525
4526     _mm_empty ();
4527 }
4528
4529 /* -----------------------------------------------------------------------
4530  * composite_in_n_8
4531  */
4532
4533 static void
4534 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4535                        pixman_op_t              op,
4536                        pixman_image_t *         src_image,
4537                        pixman_image_t *         mask_image,
4538                        pixman_image_t *         dst_image,
4539                        int32_t                  src_x,
4540                        int32_t                  src_y,
4541                        int32_t                  mask_x,
4542                        int32_t                  mask_y,
4543                        int32_t                  dest_x,
4544                        int32_t                  dest_y,
4545                        int32_t                  width,
4546                        int32_t                  height)
4547 {
4548     uint8_t     *dst_line, *dst;
4549     int dst_stride;
4550     uint32_t d;
4551     uint32_t src;
4552     int32_t w;
4553
4554     __m128i xmm_alpha;
4555     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4556
4557     PIXMAN_IMAGE_GET_LINE (
4558         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4559
4560     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4561
4562     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4563
4564     src = src >> 24;
4565
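    /* With a solid source, IN just scales the destination by the
     * source alpha: alpha 0xff leaves it unchanged and alpha 0x00
     * clears it, so both extremes skip the loop entirely.
     */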
4566     if (src == 0xff)
4567         return;
4568
4569     if (src == 0x00)
4570     {
4571         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4572                      8, dest_x, dest_y, width, height, src);
4573
4574         return;
4575     }
4576
4577     while (height--)
4578     {
4579         dst = dst_line;
4580         dst_line += dst_stride;
4581         w = width;
4582
4583         while (w && ((unsigned long)dst & 15))
4584         {
4585             d = (uint32_t) *dst;
4586
4587             *dst++ = (uint8_t) pack_1x64_32 (
4588                 pix_multiply_1x64 (
4589                     _mm_movepi64_pi64 (xmm_alpha),
4590                     unpack_32_1x64 (d)));
4591             w--;
4592         }
4593
4594         while (w >= 16)
4595         {
4596             xmm_dst = load_128_aligned ((__m128i*)dst);
4597
4598             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4599
4600             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4601                                 &xmm_dst_lo, &xmm_dst_hi,
4602                                 &xmm_dst_lo, &xmm_dst_hi);
4603
4604             save_128_aligned (
4605                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4606
4607             dst += 16;
4608             w -= 16;
4609         }
4610
4611         while (w)
4612         {
4613             d = (uint32_t) *dst;
4614
4615             *dst++ = (uint8_t) pack_1x64_32 (
4616                 pix_multiply_1x64 (
4617                     _mm_movepi64_pi64 (xmm_alpha),
4618                     unpack_32_1x64 (d)));
4619             w--;
4620         }
4621     }
4622
4623     _mm_empty ();
4624 }
4625
4626 /* ---------------------------------------------------------------------------
4627  * composite_in_8_8
4628  */
4629
4630 static void
4631 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4632                        pixman_op_t              op,
4633                        pixman_image_t *         src_image,
4634                        pixman_image_t *         mask_image,
4635                        pixman_image_t *         dst_image,
4636                        int32_t                  src_x,
4637                        int32_t                  src_y,
4638                        int32_t                  mask_x,
4639                        int32_t                  mask_y,
4640                        int32_t                  dest_x,
4641                        int32_t                  dest_y,
4642                        int32_t                  width,
4643                        int32_t                  height)
4644 {
4645     uint8_t     *dst_line, *dst;
4646     uint8_t     *src_line, *src;
4647     int src_stride, dst_stride;
4648     int32_t w;
4649     uint32_t s, d;
4650
4651     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4652     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4653
4654     PIXMAN_IMAGE_GET_LINE (
4655         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4656     PIXMAN_IMAGE_GET_LINE (
4657         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4658
4659     while (height--)
4660     {
4661         dst = dst_line;
4662         dst_line += dst_stride;
4663         src = src_line;
4664         src_line += src_stride;
4665         w = width;
4666
4667         while (w && ((unsigned long)dst & 15))
4668         {
4669             s = (uint32_t) *src++;
4670             d = (uint32_t) *dst;
4671
4672             *dst++ = (uint8_t) pack_1x64_32 (
4673                 pix_multiply_1x64 (
4674                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4675             w--;
4676         }
4677
4678         while (w >= 16)
4679         {
4680             xmm_src = load_128_unaligned ((__m128i*)src);
4681             xmm_dst = load_128_aligned ((__m128i*)dst);
4682
4683             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4684             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4685
4686             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4687                                 &xmm_dst_lo, &xmm_dst_hi,
4688                                 &xmm_dst_lo, &xmm_dst_hi);
4689
4690             save_128_aligned (
4691                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4692
4693             src += 16;
4694             dst += 16;
4695             w -= 16;
4696         }
4697
4698         while (w)
4699         {
4700             s = (uint32_t) *src++;
4701             d = (uint32_t) *dst;
4702
4703             *dst++ = (uint8_t) pack_1x64_32 (
4704                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
4705             w--;
4706         }
4707     }
4708
4709     _mm_empty ();
4710 }
4711
4712 /* -------------------------------------------------------------------------
4713  * composite_add_n_8_8
4714  */
4715
4716 static void
4717 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4718                           pixman_op_t              op,
4719                           pixman_image_t *         src_image,
4720                           pixman_image_t *         mask_image,
4721                           pixman_image_t *         dst_image,
4722                           int32_t                  src_x,
4723                           int32_t                  src_y,
4724                           int32_t                  mask_x,
4725                           int32_t                  mask_y,
4726                           int32_t                  dest_x,
4727                           int32_t                  dest_y,
4728                           int32_t                  width,
4729                           int32_t                  height)
4730 {
4731     uint8_t     *dst_line, *dst;
4732     uint8_t     *mask_line, *mask;
4733     int dst_stride, mask_stride;
4734     int32_t w;
4735     uint32_t src;
4736     uint8_t sa;
4737     uint32_t m, d;
4738
4739     __m128i xmm_alpha;
4740     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4741     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4742
4743     PIXMAN_IMAGE_GET_LINE (
4744         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4745     PIXMAN_IMAGE_GET_LINE (
4746         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4747
4748     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4749
4750     sa = src >> 24;
4751
4752     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4753
4754     while (height--)
4755     {
4756         dst = dst_line;
4757         dst_line += dst_stride;
4758         mask = mask_line;
4759         mask_line += mask_stride;
4760         w = width;
4761
4762         while (w && ((unsigned long)dst & 15))
4763         {
4764             m = (uint32_t) *mask++;
4765             d = (uint32_t) *dst;
4766
4767             *dst++ = (uint8_t) pack_1x64_32 (
4768                 _mm_adds_pu16 (
4769                     pix_multiply_1x64 (
4770                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4771                     unpack_32_1x64 (d)));
4772             w--;
4773         }
4774
4775         while (w >= 16)
4776         {
4777             xmm_mask = load_128_unaligned ((__m128i*)mask);
4778             xmm_dst = load_128_aligned ((__m128i*)dst);
4779
4780             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4781             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4782
4783             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4784                                 &xmm_mask_lo, &xmm_mask_hi,
4785                                 &xmm_mask_lo, &xmm_mask_hi);
4786
4787             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4788             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4789
4790             save_128_aligned (
4791                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4792
4793             mask += 16;
4794             dst += 16;
4795             w -= 16;
4796         }
4797
4798         while (w)
4799         {
4800             m = (uint32_t) *mask++;
4801             d = (uint32_t) *dst;
4802
4803             *dst++ = (uint8_t) pack_1x64_32 (
4804                 _mm_adds_pu16 (
4805                     pix_multiply_1x64 (
4806                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4807                     unpack_32_1x64 (d)));
4808
4809             w--;
4810         }
4811     }
4812
4813     _mm_empty ();
4814 }
4815
4816 /* -------------------------------------------------------------------------
4817  * composite_add_n_8
4818  */
4819
4820 static void
4821 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4822                         pixman_op_t              op,
4823                         pixman_image_t *         src_image,
4824                         pixman_image_t *         mask_image,
4825                         pixman_image_t *         dst_image,
4826                         int32_t                  src_x,
4827                         int32_t                  src_y,
4828                         int32_t                  mask_x,
4829                         int32_t                  mask_y,
4830                         int32_t                  dest_x,
4831                         int32_t                  dest_y,
4832                         int32_t                  width,
4833                         int32_t                  height)
4834 {
4835     uint8_t     *dst_line, *dst;
4836     int dst_stride;
4837     int32_t w;
4838     uint32_t src;
4839
4840     __m128i xmm_src;
4841
4842     PIXMAN_IMAGE_GET_LINE (
4843         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4844
4845     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4846
4847     src >>= 24;
4848
4849     if (src == 0x00)
4850         return;
4851
4852     if (src == 0xff)
4853     {
4854         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4855                      8, dest_x, dest_y, width, height, 0xff);
4856
4857         return;
4858     }
4859
4860     src = (src << 24) | (src << 16) | (src << 8) | src;
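    /* Replicate the 8-bit alpha into every byte of xmm_src so that
     * one saturating add covers 16 destination bytes at a time.
     */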
4861     xmm_src = _mm_set_epi32 (src, src, src, src);
4862
4863     while (height--)
4864     {
4865         dst = dst_line;
4866         dst_line += dst_stride;
4867         w = width;
4868
4869         while (w && ((unsigned long)dst & 15))
4870         {
4871             *dst = (uint8_t)_mm_cvtsi64_si32 (
4872                 _mm_adds_pu8 (
4873                     _mm_movepi64_pi64 (xmm_src),
4874                     _mm_cvtsi32_si64 (*dst)));
4875
4876             w--;
4877             dst++;
4878         }
4879
4880         while (w >= 16)
4881         {
4882             save_128_aligned (
4883                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4884
4885             dst += 16;
4886             w -= 16;
4887         }
4888
4889         while (w)
4890         {
4891             *dst = (uint8_t)_mm_cvtsi64_si32 (
4892                 _mm_adds_pu8 (
4893                     _mm_movepi64_pi64 (xmm_src),
4894                     _mm_cvtsi32_si64 (*dst)));
4895
4896             w--;
4897             dst++;
4898         }
4899     }
4900
4901     _mm_empty ();
4902 }
4903
4904 /* ----------------------------------------------------------------------
4905  * composite_add_8000_8000
4906  */
4907
4908 static void
4909 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
4910                               pixman_op_t              op,
4911                               pixman_image_t *         src_image,
4912                               pixman_image_t *         mask_image,
4913                               pixman_image_t *         dst_image,
4914                               int32_t                  src_x,
4915                               int32_t                  src_y,
4916                               int32_t                  mask_x,
4917                               int32_t                  mask_y,
4918                               int32_t                  dest_x,
4919                               int32_t                  dest_y,
4920                               int32_t                  width,
4921                               int32_t                  height)
4922 {
4923     uint8_t     *dst_line, *dst;
4924     uint8_t     *src_line, *src;
4925     int dst_stride, src_stride;
4926     int32_t w;
4927     uint16_t t;
4928
4929     PIXMAN_IMAGE_GET_LINE (
4930         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4931     PIXMAN_IMAGE_GET_LINE (
4932         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4933
4934     while (height--)
4935     {
4936         dst = dst_line;
4937         src = src_line;
4938
4939         dst_line += dst_stride;
4940         src_line += src_stride;
4941         w = width;
4942
4943         /* Small head */
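        /* t | (0 - (t >> 8)) is a branch-free saturating add: when
         * the 9-bit sum exceeds 0xff, (t >> 8) is 1, (0 - 1) sets
         * every bit, and the stored byte clamps to 0xff.
         */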
4944         while (w && (unsigned long)dst & 3)
4945         {
4946             t = (*dst) + (*src++);
4947             *dst++ = t | (0 - (t >> 8));
4948             w--;
4949         }
4950
4951         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4952
4953         /* Small tail */
4954         dst += w & 0xfffc;
4955         src += w & 0xfffc;
4956
4957         w &= 3;
4958
4959         while (w)
4960         {
4961             t = (*dst) + (*src++);
4962             *dst++ = t | (0 - (t >> 8));
4963             w--;
4964         }
4965     }
4966
4967     _mm_empty ();
4968 }
4969
4970 /* ---------------------------------------------------------------------
4971  * composite_add_8888_8888
4972  */
4973 static void
4974 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4975                               pixman_op_t              op,
4976                               pixman_image_t *         src_image,
4977                               pixman_image_t *         mask_image,
4978                               pixman_image_t *         dst_image,
4979                               int32_t                  src_x,
4980                               int32_t                  src_y,
4981                               int32_t                  mask_x,
4982                               int32_t                  mask_y,
4983                               int32_t                  dest_x,
4984                               int32_t                  dest_y,
4985                               int32_t                  width,
4986                               int32_t                  height)
4987 {
4988     uint32_t    *dst_line, *dst;
4989     uint32_t    *src_line, *src;
4990     int dst_stride, src_stride;
4991
4992     PIXMAN_IMAGE_GET_LINE (
4993         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4994     PIXMAN_IMAGE_GET_LINE (
4995         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4996
4997     while (height--)
4998     {
4999         dst = dst_line;
5000         dst_line += dst_stride;
5001         src = src_line;
5002         src_line += src_stride;
5003
5004         core_combine_add_u_sse2 (dst, src, NULL, width);
5005     }
5006
5007     _mm_empty ();
5008 }
5009
5010 /* -------------------------------------------------------------------------------------------------
5011  * sse2_composite_copy_area
5012  */
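/* pixman_blt_sse2 copies each scanline with unaligned 128-bit loads
 * and aligned 128-bit stores, 64 bytes per iteration in the main
 * loop, after aligning the destination pointer; only the 16 bpp and
 * 32 bpp cases are handled, anything else returns FALSE.
 */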
5013
5014 static pixman_bool_t
5015 pixman_blt_sse2 (uint32_t *src_bits,
5016                  uint32_t *dst_bits,
5017                  int       src_stride,
5018                  int       dst_stride,
5019                  int       src_bpp,
5020                  int       dst_bpp,
5021                  int       src_x,
5022                  int       src_y,
5023                  int       dst_x,
5024                  int       dst_y,
5025                  int       width,
5026                  int       height)
5027 {
5028     uint8_t *   src_bytes;
5029     uint8_t *   dst_bytes;
5030     int byte_width;
5031
5032     if (src_bpp != dst_bpp)
5033         return FALSE;
5034
5035     if (src_bpp == 16)
5036     {
5037         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5038         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5039         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5040         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5041         byte_width = 2 * width;
5042         src_stride *= 2;
5043         dst_stride *= 2;
5044     }
5045     else if (src_bpp == 32)
5046     {
5047         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5048         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5049         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5050         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5051         byte_width = 4 * width;
5052         src_stride *= 4;
5053         dst_stride *= 4;
5054     }
5055     else
5056     {
5057         return FALSE;
5058     }
5059
5060     while (height--)
5061     {
5062         int w;
5063         uint8_t *s = src_bytes;
5064         uint8_t *d = dst_bytes;
5065         src_bytes += src_stride;
5066         dst_bytes += dst_stride;
5067         w = byte_width;
5068
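             /* Align the destination to 4 and then 16 bytes, stream 64-byte
              * blocks (unaligned loads, aligned stores), then drain the
              * 16-, 4- and 2-byte tails. */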
5069         while (w >= 2 && ((unsigned long)d & 3))
5070         {
5071             *(uint16_t *)d = *(uint16_t *)s;
5072             w -= 2;
5073             s += 2;
5074             d += 2;
5075         }
5076
5077         while (w >= 4 && ((unsigned long)d & 15))
5078         {
5079             *(uint32_t *)d = *(uint32_t *)s;
5080
5081             w -= 4;
5082             s += 4;
5083             d += 4;
5084         }
5085
5086         while (w >= 64)
5087         {
5088             __m128i xmm0, xmm1, xmm2, xmm3;
5089
5090             xmm0 = load_128_unaligned ((__m128i*)(s));
5091             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5092             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5093             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5094
5095             save_128_aligned ((__m128i*)(d),    xmm0);
5096             save_128_aligned ((__m128i*)(d + 16), xmm1);
5097             save_128_aligned ((__m128i*)(d + 32), xmm2);
5098             save_128_aligned ((__m128i*)(d + 48), xmm3);
5099
5100             s += 64;
5101             d += 64;
5102             w -= 64;
5103         }
5104
5105         while (w >= 16)
5106         {
5107             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5108
5109             w -= 16;
5110             d += 16;
5111             s += 16;
5112         }
5113
5114         while (w >= 4)
5115         {
5116             *(uint32_t *)d = *(uint32_t *)s;
5117
5118             w -= 4;
5119             s += 4;
5120             d += 4;
5121         }
5122
5123         if (w >= 2)
5124         {
5125             *(uint16_t *)d = *(uint16_t *)s;
5126             w -= 2;
5127             s += 2;
5128             d += 2;
5129         }
5130     }
5131
5132     _mm_empty ();
5133
5134     return TRUE;
5135 }
5136
5137 static void
5138 sse2_composite_copy_area (pixman_implementation_t *imp,
5139                           pixman_op_t              op,
5140                           pixman_image_t *         src_image,
5141                           pixman_image_t *         mask_image,
5142                           pixman_image_t *         dst_image,
5143                           int32_t                  src_x,
5144                           int32_t                  src_y,
5145                           int32_t                  mask_x,
5146                           int32_t                  mask_y,
5147                           int32_t                  dest_x,
5148                           int32_t                  dest_y,
5149                           int32_t                  width,
5150                           int32_t                  height)
5151 {
5152     pixman_blt_sse2 (src_image->bits.bits,
5153                      dst_image->bits.bits,
5154                      src_image->bits.rowstride,
5155                      dst_image->bits.rowstride,
5156                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5157                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5158                      src_x, src_y, dest_x, dest_y, width, height);
5159 }
5160
5161 static void
5162 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5163                                  pixman_op_t              op,
5164                                  pixman_image_t *         src_image,
5165                                  pixman_image_t *         mask_image,
5166                                  pixman_image_t *         dst_image,
5167                                  int32_t                  src_x,
5168                                  int32_t                  src_y,
5169                                  int32_t                  mask_x,
5170                                  int32_t                  mask_y,
5171                                  int32_t                  dest_x,
5172                                  int32_t                  dest_y,
5173                                  int32_t                  width,
5174                                  int32_t                  height)
5175 {
5176     uint32_t    *src, *src_line, s;
5177     uint32_t    *dst, *dst_line, d;
5178     uint8_t         *mask, *mask_line;
5179     uint32_t m;
5180     int src_stride, mask_stride, dst_stride;
5181     int32_t w;
5182     __m64 ms;
5183
5184     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5185     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5186     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5187
5188     PIXMAN_IMAGE_GET_LINE (
5189         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5190     PIXMAN_IMAGE_GET_LINE (
5191         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5192     PIXMAN_IMAGE_GET_LINE (
5193         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5194
5195     while (height--)
5196     {
5197         src = src_line;
5198         src_line += src_stride;
5199         dst = dst_line;
5200         dst_line += dst_stride;
5201         mask = mask_line;
5202         mask_line += mask_stride;
5203
5204         w = width;
5205
5206         while (w && (unsigned long)dst & 15)
5207         {
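                 /* The source is x888: force the alpha channel to opaque. */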
5208             s = 0xff000000 | *src++;
5209             m = (uint32_t) *mask++;
5210             d = *dst;
5211             ms = unpack_32_1x64 (s);
5212
5213             if (m != 0xff)
5214             {
5215                 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5216                 __m64 md = unpack_32_1x64 (d);
5217
5218                 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5219             }
5220
5221             *dst++ = pack_1x64_32 (ms);
5222             w--;
5223         }
5224
5225         while (w >= 4)
5226         {
5227             m = *(uint32_t*) mask;
5228             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5229
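                 /* All four a8 mask values are 0xff: the (now opaque) source
                  * simply replaces the destination. */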
5230             if (m == 0xffffffff)
5231             {
5232                 save_128_aligned ((__m128i*)dst, xmm_src);
5233             }
5234             else
5235             {
5236                 xmm_dst = load_128_aligned ((__m128i*)dst);
5237
5238                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128 ());
5239
5240                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5241                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5242                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5243
5244                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5245
5246                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5247
5248                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5249             }
5250
5251             src += 4;
5252             dst += 4;
5253             mask += 4;
5254             w -= 4;
5255         }
5256
5257         while (w)
5258         {
5259             m = (uint32_t) *mask++;
5260
5261             if (m)
5262             {
5263                 s = 0xff000000 | *src;
5264
5265                 if (m == 0xff)
5266                 {
5267                     *dst = s;
5268                 }
5269                 else
5270                 {
5271                     __m64 ma, md, ms;
5272
5273                     d = *dst;
5274
5275                     ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5276                     md = unpack_32_1x64 (d);
5277                     ms = unpack_32_1x64 (s);
5278
5279                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5280                 }
5281
5282             }
5283
5284             src++;
5285             dst++;
5286             w--;
5287         }
5288     }
5289
5290     _mm_empty ();
5291 }
5292
5293 static void
5294 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5295                                  pixman_op_t              op,
5296                                  pixman_image_t *         src_image,
5297                                  pixman_image_t *         mask_image,
5298                                  pixman_image_t *         dst_image,
5299                                  int32_t                  src_x,
5300                                  int32_t                  src_y,
5301                                  int32_t                  mask_x,
5302                                  int32_t                  mask_y,
5303                                  int32_t                  dest_x,
5304                                  int32_t                  dest_y,
5305                                  int32_t                  width,
5306                                  int32_t                  height)
5307 {
5308     uint32_t    *src, *src_line, s;
5309     uint32_t    *dst, *dst_line, d;
5310     uint8_t         *mask, *mask_line;
5311     uint32_t m;
5312     int src_stride, mask_stride, dst_stride;
5313     int32_t w;
5314
5315     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5316     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5317     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5318
5319     PIXMAN_IMAGE_GET_LINE (
5320         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5321     PIXMAN_IMAGE_GET_LINE (
5322         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5323     PIXMAN_IMAGE_GET_LINE (
5324         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5325
5326     while (height--)
5327     {
5328         src = src_line;
5329         src_line += src_stride;
5330         dst = dst_line;
5331         dst_line += dst_stride;
5332         mask = mask_line;
5333         mask_line += mask_stride;
5334
5335         w = width;
5336
5337         while (w && (unsigned long)dst & 15)
5338         {
5339             uint32_t sa;
5340
5341             s = *src++;
5342             m = (uint32_t) *mask++;
5343             d = *dst;
5344
5345             sa = s >> 24;
5346
5347             if (m)
5348             {
5349                 if (sa == 0xff && m == 0xff)
5350                 {
5351                     *dst = s;
5352                 }
5353                 else
5354                 {
5355                     __m64 ms, md, ma, msa;
5356
5357                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5358                     ms = unpack_32_1x64 (s);
5359                     md = unpack_32_1x64 (d);
5360
5361                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5362
5363                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5364                 }
5365             }
5366
5367             dst++;
5368             w--;
5369         }
5370
5371         while (w >= 4)
5372         {
5373             m = *(uint32_t *) mask;
5374
5375             if (m)
5376             {
5377                 xmm_src = load_128_unaligned ((__m128i*)src);
5378
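                     /* Fully opaque mask and source: OVER reduces to a copy. */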
5379                 if (m == 0xffffffff && is_opaque (xmm_src))
5380                 {
5381                     save_128_aligned ((__m128i *)dst, xmm_src);
5382                 }
5383                 else
5384                 {
5385                     xmm_dst = load_128_aligned ((__m128i *)dst);
5386
5387                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128 ());
5388
5389                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5390                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5391                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5392
5393                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5394                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5395
5396                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5397                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5398
5399                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5400                 }
5401             }
5402
5403             src += 4;
5404             dst += 4;
5405             mask += 4;
5406             w -= 4;
5407         }
5408
5409         while (w)
5410         {
5411             uint32_t sa;
5412
5413             s = *src++;
5414             m = (uint32_t) *mask++;
5415             d = *dst;
5416
5417             sa = s >> 24;
5418
5419             if (m)
5420             {
5421                 if (sa == 0xff && m == 0xff)
5422                 {
5423                     *dst = s;
5424                 }
5425                 else
5426                 {
5427                     __m64 ms, md, ma, msa;
5428
5429                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5430                     ms = unpack_32_1x64 (s);
5431                     md = unpack_32_1x64 (d);
5432
5433                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5434
5435                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5436                 }
5437             }
5438
5439             dst++;
5440             w--;
5441         }
5442     }
5443
5444     _mm_empty ();
5445 }
5446
5447 static void
5448 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5449                                     pixman_op_t              op,
5450                                     pixman_image_t *         src_image,
5451                                     pixman_image_t *         mask_image,
5452                                     pixman_image_t *         dst_image,
5453                                     int32_t                  src_x,
5454                                     int32_t                  src_y,
5455                                     int32_t                  mask_x,
5456                                     int32_t                  mask_y,
5457                                     int32_t                  dest_x,
5458                                     int32_t                  dest_y,
5459                                     int32_t                  width,
5460                                     int32_t                  height)
5461 {
5462     uint32_t src;
5463     uint32_t    *dst_line, *dst;
5464     __m128i xmm_src;
5465     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5466     __m128i xmm_dsta_hi, xmm_dsta_lo;
5467     int dst_stride;
5468     int32_t w;
5469
5470     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5471
5472     if (src == 0)
5473         return;
5474
5475     PIXMAN_IMAGE_GET_LINE (
5476         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5477
5478     xmm_src = expand_pixel_32_1x128 (src);
5479
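         /* OVER_REVERSE: the destination is composited over the solid source,
          * so dst and its alpha take the source slots in the over helpers. */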
5480     while (height--)
5481     {
5482         dst = dst_line;
5483
5484         dst_line += dst_stride;
5485         w = width;
5486
5487         while (w && (unsigned long)dst & 15)
5488         {
5489             __m64 vd;
5490
5491             vd = unpack_32_1x64 (*dst);
5492
5493             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5494                                             _mm_movepi64_pi64 (xmm_src)));
5495             w--;
5496             dst++;
5497         }
5498
5499         while (w >= 4)
5500         {
5501             __m128i tmp_lo, tmp_hi;
5502
5503             xmm_dst = load_128_aligned ((__m128i*)dst);
5504
5505             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5506             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5507
5508             tmp_lo = xmm_src;
5509             tmp_hi = xmm_src;
5510
5511             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5512                         &xmm_dsta_lo, &xmm_dsta_hi,
5513                         &tmp_lo, &tmp_hi);
5514
5515             save_128_aligned (
5516                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5517
5518             w -= 4;
5519             dst += 4;
5520         }
5521
5522         while (w)
5523         {
5524             __m64 vd;
5525
5526             vd = unpack_32_1x64 (*dst);
5527
5528             *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5529                                             _mm_movepi64_pi64 (xmm_src)));
5530             w--;
5531             dst++;
5532         }
5533
5534     }
5535
5536     _mm_empty ();
5537 }
5538
5539 static void
5540 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5541                                     pixman_op_t              op,
5542                                     pixman_image_t *         src_image,
5543                                     pixman_image_t *         mask_image,
5544                                     pixman_image_t *         dst_image,
5545                                     int32_t                  src_x,
5546                                     int32_t                  src_y,
5547                                     int32_t                  mask_x,
5548                                     int32_t                  mask_y,
5549                                     int32_t                  dest_x,
5550                                     int32_t                  dest_y,
5551                                     int32_t                  width,
5552                                     int32_t                  height)
5553 {
5554     uint32_t    *src, *src_line, s;
5555     uint32_t    *dst, *dst_line, d;
5556     uint32_t    *mask, *mask_line;
5557     uint32_t    m;
5558     int src_stride, mask_stride, dst_stride;
5559     int32_t w;
5560
5561     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5562     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5563     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5564
5565     PIXMAN_IMAGE_GET_LINE (
5566         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5567     PIXMAN_IMAGE_GET_LINE (
5568         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5569     PIXMAN_IMAGE_GET_LINE (
5570         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5571
5572     while (height--)
5573     {
5574         src = src_line;
5575         src_line += src_stride;
5576         dst = dst_line;
5577         dst_line += dst_stride;
5578         mask = mask_line;
5579         mask_line += mask_stride;
5580
5581         w = width;
5582
5583         while (w && (unsigned long)dst & 15)
5584         {
5585             uint32_t sa;
5586
5587             s = *src++;
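                 /* The mask is a8r8g8b8; only its alpha byte is needed here. */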
5588             m = (*mask++) >> 24;
5589             d = *dst;
5590
5591             sa = s >> 24;
5592
5593             if (m)
5594             {
5595                 if (sa == 0xff && m == 0xff)
5596                 {
5597                     *dst = s;
5598                 }
5599                 else
5600                 {
5601                     __m64 ms, md, ma, msa;
5602
5603                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5604                     ms = unpack_32_1x64 (s);
5605                     md = unpack_32_1x64 (d);
5606
5607                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5608
5609                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5610                 }
5611             }
5612
5613             dst++;
5614             w--;
5615         }
5616
5617         while (w >= 4)
5618         {
5619             xmm_mask = load_128_unaligned ((__m128i*)mask);
5620
5621             if (!is_transparent (xmm_mask))
5622             {
5623                 xmm_src = load_128_unaligned ((__m128i*)src);
5624
5625                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5626                 {
5627                     save_128_aligned ((__m128i *)dst, xmm_src);
5628                 }
5629                 else
5630                 {
5631                     xmm_dst = load_128_aligned ((__m128i *)dst);
5632
5633                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5634                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5635                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5636
5637                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5638                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5639
5640                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5641                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5642
5643                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5644                 }
5645             }
5646
5647             src += 4;
5648             dst += 4;
5649             mask += 4;
5650             w -= 4;
5651         }
5652
5653         while (w)
5654         {
5655             uint32_t sa;
5656
5657             s = *src++;
5658             m = (*mask++) >> 24;
5659             d = *dst;
5660
5661             sa = s >> 24;
5662
5663             if (m)
5664             {
5665                 if (sa == 0xff && m == 0xff)
5666                 {
5667                     *dst = s;
5668                 }
5669                 else
5670                 {
5671                     __m64 ms, md, ma, msa;
5672
5673                     ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5674                     ms = unpack_32_1x64 (s);
5675                     md = unpack_32_1x64 (d);
5676
5677                     msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5678
5679                     *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5680                 }
5681             }
5682
5683             dst++;
5684             w--;
5685         }
5686     }
5687
5688     _mm_empty ();
5689 }
5690
5691 /* A variant of 'core_combine_over_u_sse2' that fetches its source pixels
5692  * through a 16.16 fixed-point coordinate, for nearest-neighbour scaling */
5692 static force_inline void
5693 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5694                                              const uint32_t* ps,
5695                                              int32_t         w,
5696                                              pixman_fixed_t  vx,
5697                                              pixman_fixed_t  unit_x,
5698                                              pixman_fixed_t  max_vx)
5699 {
5700     uint32_t s, d;
5701     const uint32_t* pm = NULL;
5702
5703     __m128i xmm_dst_lo, xmm_dst_hi;
5704     __m128i xmm_src_lo, xmm_src_hi;
5705     __m128i xmm_alpha_lo, xmm_alpha_hi;
5706
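         /* vx advances by unit_x (the fixed-point source step) for each
          * destination pixel; vx >> 16 is the integer source index. */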
5707     /* Align dst on a 16-byte boundary */
5708     while (w && ((unsigned long)pd & 15))
5709     {
5710         d = *pd;
5711         s = combine1 (ps + (vx >> 16), pm);
5712         vx += unit_x;
5713
5714         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5715         if (pm)
5716             pm++;
5717         w--;
5718     }
5719
5720     while (w >= 4)
5721     {
5722         __m128i tmp;
5723         uint32_t tmp1, tmp2, tmp3, tmp4;
5724
5725         tmp1 = ps[vx >> 16];
5726         vx += unit_x;
5727         tmp2 = ps[vx >> 16];
5728         vx += unit_x;
5729         tmp3 = ps[vx >> 16];
5730         vx += unit_x;
5731         tmp4 = ps[vx >> 16];
5732         vx += unit_x;
5733
5734         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5735
5736         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5737
5738         if (is_opaque (xmm_src_hi))
5739         {
5740             save_128_aligned ((__m128i*)pd, xmm_src_hi);
5741         }
5742         else if (!is_zero (xmm_src_hi))
5743         {
5744             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5745
5746             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5747             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5748
5749             expand_alpha_2x128 (
5750                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5751
5752             over_2x128 (&xmm_src_lo, &xmm_src_hi,
5753                         &xmm_alpha_lo, &xmm_alpha_hi,
5754                         &xmm_dst_lo, &xmm_dst_hi);
5755
5756             /* rebuild the 4 pixel data and save */
5757             save_128_aligned ((__m128i*)pd,
5758                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5759         }
5760
5761         w -= 4;
5762         pd += 4;
5763         if (pm)
5764             pm += 4;
5765     }
5766
5767     while (w)
5768     {
5769         d = *pd;
5770         s = combine1 (ps + (vx >> 16), pm);
5771         vx += unit_x;
5772
5773         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5774         if (pm)
5775             pm++;
5776
5777         w--;
5778     }
5779     _mm_empty ();
5780 }
5781
5782 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5783                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5784                        uint32_t, uint32_t, COVER);
5785 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5786                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5787                        uint32_t, uint32_t, NONE);
5788 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5789                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5790                        uint32_t, uint32_t, PAD);
5791
5792 static const pixman_fast_path_t sse2_fast_paths[] =
5793 {
5794     /* PIXMAN_OP_OVER */
5795     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5796     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5797     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5798     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5799     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5800     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5801     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5802     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5803     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5804     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5805     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5806     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5807     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5808     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5809     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5810     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5811     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5812     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5813     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5814     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5815     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5816     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5817     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5818     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5819     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5820     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5821     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5822     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5823     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5824     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5825     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5826     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5827     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5828     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5829     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5830     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5831     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5832     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5833     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5834     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5835     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5836     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5837     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5838     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5839     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5840     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5841
5842     /* PIXMAN_OP_OVER_REVERSE */
5843     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5844     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5845
5846     /* PIXMAN_OP_ADD */
5847     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5848     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
5849     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5850     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5851     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5852     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5853
5854     /* PIXMAN_OP_SRC */
5855     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5856     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5857     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5858     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5859     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5860     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5861     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5862     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5863     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5864     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5865     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5866     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5867     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5868     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5869
5870     /* PIXMAN_OP_IN */
5871     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5872     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5873     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5874
5875     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5876     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5877     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5878     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5879     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5880     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5881     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5882     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5883     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5884     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5885     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5886     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5887
5888     { PIXMAN_OP_NONE },
5889 };
5890
5891 static pixman_bool_t
5892 sse2_blt (pixman_implementation_t *imp,
5893           uint32_t *               src_bits,
5894           uint32_t *               dst_bits,
5895           int                      src_stride,
5896           int                      dst_stride,
5897           int                      src_bpp,
5898           int                      dst_bpp,
5899           int                      src_x,
5900           int                      src_y,
5901           int                      dst_x,
5902           int                      dst_y,
5903           int                      width,
5904           int                      height)
5905 {
5906     if (!pixman_blt_sse2 (
5907             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5908             src_x, src_y, dst_x, dst_y, width, height))
5909
5910     {
5911         return _pixman_implementation_blt (
5912             imp->delegate,
5913             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5914             src_x, src_y, dst_x, dst_y, width, height);
5915     }
5916
5917     return TRUE;
5918 }
5919
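     /* force_align_arg_pointer realigns the stack on entry, so 16-byte
      * aligned SSE2 spills are safe even when a 32-bit caller only
      * maintained 4-byte stack alignment. */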
5920 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5921 __attribute__((__force_align_arg_pointer__))
5922 #endif
5923 static pixman_bool_t
5924 sse2_fill (pixman_implementation_t *imp,
5925            uint32_t *               bits,
5926            int                      stride,
5927            int                      bpp,
5928            int                      x,
5929            int                      y,
5930            int                      width,
5931            int                      height,
5932            uint32_t xor)
5933 {
5934     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5935     {
5936         return _pixman_implementation_fill (
5937             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5938     }
5939
5940     return TRUE;
5941 }
5942
5943 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5944 __attribute__((__force_align_arg_pointer__))
5945 #endif
5946 pixman_implementation_t *
5947 _pixman_implementation_create_sse2 (void)
5948 {
5949 #ifdef USE_MMX
5950     pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
5951 #else
5952     pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
5953 #endif
5954     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
5955
5956     /* SSE2 constants */
5957     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5958     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5959     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5960     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5961     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5962     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5963     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5964     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5965     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
5966     mask_0080 = create_mask_16_128 (0x0080);
5967     mask_00ff = create_mask_16_128 (0x00ff);
5968     mask_0101 = create_mask_16_128 (0x0101);
5969     mask_ffff = create_mask_16_128 (0xffff);
5970     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5971     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5972
5973     /* MMX constants */
5974     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5975     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5976
5977     mask_x0080 = create_mask_16_64 (0x0080);
5978     mask_x00ff = create_mask_16_64 (0x00ff);
5979     mask_x0101 = create_mask_16_64 (0x0101);
5980     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5981
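         /* The __m64 constants above leave the CPU in MMX state; EMMS clears
          * it before any floating point code runs. */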
5982     _mm_empty ();
5983
5984     /* Set up function pointers */
5985
5986     /* SSE code patch for fbcompose.c */
5987     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5988     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5989     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5990     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5991     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5992     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5993     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5994     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5995     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5996     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5997
5998     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5999
6000     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6001     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6002     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6003     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6004     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6005     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6006     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6007     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6008     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6009     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6010     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6011
6012     imp->blt = sse2_blt;
6013     imp->fill = sse2_fill;
6014
6015     return imp;
6016 }
6017
6018 #endif /* USE_SSE2 */