sse2: Convert all uses of MMX registers to use SSE2 registers instead.
pixman/pixman-sse2.c
/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
#   include "pixman-x64-mmx-emulation.h"
#endif

#ifdef USE_SSE2

/* --------------------------------------------------------------------
 * Locals
 */

static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_x_alpha;

static __m64 mask_x565_rgb;
static __m64 mask_x565_unpack;

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

/* ----------------------------------------------------------------------
 * SSE2 Inlines
 */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
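
/* For reference, a scalar sketch of what unpack_565_to_8888 computes per
 * pixel (scalar_565_to_8888 is a hypothetical helper, not part of pixman's
 * API): each 5- or 6-bit channel is shifted to the top of its 8-bit slot
 * and its high bits are replicated into the freed low bits, so 0x1f maps
 * to 0xff and 0x00 stays 0x00.
 */
static force_inline uint32_t
scalar_565_to_8888 (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);    /* replicate the top 3 bits of red */
    g = (g << 2) | (g >> 4);    /* replicate the top 2 bits of green */
    b = (b << 3) | (b >> 2);    /* replicate the top 3 bits of blue */

    return (r << 16) | (g << 8) | b;
}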

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}
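
/* Worked example: packing x8r8g8b8 0x00ff8040 keeps the top 5/6/5 bits of
 * each channel - red 0xff >> 3 = 0x1f at bits 11-15, green 0x80 >> 2 = 0x20
 * at bits 5-10, blue 0x40 >> 3 = 0x08 at bits 0-4 - giving 0xfc08.
 */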

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
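
/* Note on the 0x8888 masks above: _mm_movemask_epi8 yields one bit per
 * byte, and in a register holding four a8r8g8b8 pixels the alpha bytes
 * are bytes 3, 7, 11 and 15, i.e. movemask bits 0x8888.  So is_opaque
 * tests that all four alphas are 0xff and is_transparent that all four
 * are zero, while ignoring the color channels; is_zero tests the whole
 * register.
 */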

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
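
/* A scalar sketch (scalar_pix_multiply is a hypothetical helper, not part
 * of pixman's API) of the per-channel rounding above: adding 0x80 and
 * taking the high 16 bits of a multiply by 0x0101 divides by 255 with
 * correct rounding, since (t * 0x0101) >> 16 == (t + (t >> 8)) >> 8 for
 * 16-bit t.  This matches the MUL_UN8 macro the C paths in
 * pixman-combine32.h use.
 */
static force_inline uint8_t
scalar_pix_multiply (uint8_t x, uint8_t a)
{
    uint16_t t = x * a + 0x80;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}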

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}
/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}
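
/* _mm_stream_si128 is a non-temporal store: it goes through the write
 * combining buffers and bypasses the cache, which avoids evicting useful
 * cache lines when a composite writes a large destination exactly once.
 */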

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}
/* save 4 pixels to an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

/* ------------------------------------------------------------------
 * MMX inlines
 */

static force_inline __m64
load_32_1x64 (uint32_t data)
{
    return _mm_cvtsi32_si64 (data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
}

static force_inline __m64
expand_alpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
expand_pixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (
        unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
pix_multiply_1x64 (__m64 data,
                   __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          mask_x0080),
                           mask_x0101);
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m64
pix_add_multiply_1x64 (__m64* src,
                       __m64* alpha_dst,
                       __m64* dst,
                       __m64* alpha_src)
{
    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);

    return _mm_adds_pu8 (t1, t2);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, mask_x00ff);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m64
invert_colors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m64
in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pix_multiply_1x64 (*src, *mask),
                      pix_multiply_1x64 (*alpha, *mask),
                      *dst);
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m64
over_rev_non_pre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expand_alpha_1x64 (src);

    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
                                         _mm_or_si64 (alpha, mask_x_alpha)),
                      alpha,
                      dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}
/* Expand a 565 pixel held in the low word of an MMX register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, mask_x565_rgb);
    p = _mm_mullo_pi16 (p, mask_x565_unpack);

    return _mm_srli_pi16 (p, 8);
}
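
/* Sketch of the multiply trick above, assuming mask_x565_rgb and
 * mask_x565_unpack hold the constants from the comment (0x01f0003f001f
 * and 0x008404100840): after the shifts and the AND, the r, g and b bits
 * sit in three separate 16-bit lanes, and the multiply acts as a pair of
 * shifts per lane - e.g. blue times 0x0840 is (b << 11) | (b << 6) -
 * replicating each channel's high bits below it; the final >> 8 then
 * leaves one 8-bit channel per lane.
 */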

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

/* ----------------------------------------------------------------------------
 * Compose Core transformations
 */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}
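
/* This is Porter-Duff OVER for premultiplied pixels,
 *
 *     dst = src + (1 - alpha (src)) * dst
 *
 * applied per channel.  The two early returns handle the common cases:
 * a fully opaque source replaces the destination outright, and an
 * all-zero source leaves it untouched.
 */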

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
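
/* combine1 and combine4 implement the unified source fetch: when a mask
 * is present, the source is multiplied by the mask's expanded alpha
 * before the operator runs, so the combiners below can treat the masked
 * and unmasked cases identically.
 */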

static force_inline void
core_combine_over_u_sse2_mask (uint32_t *         pd,
                               const uint32_t*    ps,
                               const uint32_t*    pm,
                               int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t*    ps,
                                  int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static force_inline void
core_combine_over_reverse_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static force_inline void
core_combine_in_u_sse2 (uint32_t*       pd,
                        const uint32_t* ps,
                        const uint32_t* pm,
                        int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_reverse_in_u_sse2 (uint32_t*       pd,
                                const uint32_t* ps,
                                const uint32_t *pm,
                                int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_reverse_out_u_sse2 (uint32_t*       pd,
                                 const uint32_t* ps,
                                 const uint32_t* pm,
                                 int             w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static force_inline void
core_combine_out_u_sse2 (uint32_t*       pd,
                         const uint32_t* ps,
                         const uint32_t* pm,
                         int             w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
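
/* ATOP on premultiplied pixels: dst = src * alpha (dst) +
 * dst * (1 - alpha (src)); the source is drawn only where the
 * destination already has coverage.
 */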

static force_inline void
core_combine_atop_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static force_inline void
core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}
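
/* XOR on premultiplied pixels: dst = src * (1 - alpha (dst)) +
 * dst * (1 - alpha (src)); only the regions where exactly one of the
 * two images is present survive.
 */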

static force_inline void
core_combine_xor_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t *mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_add_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t* mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}
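
/* SATURATE adds source into destination, but when the source alpha sa
 * exceeds the destination's remaining headroom da = 255 - alpha (dst),
 * the source is first scaled by DIV_UN8 (da, sa) ~= da * 255 / sa so
 * the saturating add cannot push the alpha channel past 0xff.
 */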

static force_inline void
core_combine_saturate_u_sse2 (uint32_t *      pd,
                              const uint32_t *ps,
                              const uint32_t *pm,
                              int             w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned  ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some source alpha is greater than the respective ~dst alpha */
1570         if (pack_cmp)
1571         {
1572             s = combine1 (ps++, pm);
1573             d = *pd;
1574             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1575             if (pm)
1576                 pm++;
1577
1578             s = combine1 (ps++, pm);
1579             d = *pd;
1580             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1581             if (pm)
1582                 pm++;
1583
1584             s = combine1 (ps++, pm);
1585             d = *pd;
1586             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1587             if (pm)
1588                 pm++;
1589
1590             s = combine1 (ps++, pm);
1591             d = *pd;
1592             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1593             if (pm)
1594                 pm++;
1595         }
1596         else
1597         {
1598             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1599
1600             pd += 4;
1601             ps += 4;
1602             if (pm)
1603                 pm += 4;
1604         }
1605
1606         w -= 4;
1607     }
1608
1609     while (w--)
1610     {
1611         s = combine1 (ps, pm);
1612         d = *pd;
1613
1614         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1615         ps++;
1616         if (pm)
1617             pm++;
1618     }
1619 }
1620
1621 static force_inline void
1622 core_combine_src_ca_sse2 (uint32_t*       pd,
1623                           const uint32_t* ps,
1624                           const uint32_t *pm,
1625                           int             w)
1626 {
1627     uint32_t s, m;
1628
1629     __m128i xmm_src_lo, xmm_src_hi;
1630     __m128i xmm_mask_lo, xmm_mask_hi;
1631     __m128i xmm_dst_lo, xmm_dst_hi;
1632
1633     while (w && (unsigned long)pd & 15)
1634     {
1635         s = *ps++;
1636         m = *pm++;
1637         *pd++ = pack_1x128_32 (
1638             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1639         w--;
1640     }
1641
1642     while (w >= 4)
1643     {
1644         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1645         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1646
1647         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1648         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1649
1650         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1651                             &xmm_mask_lo, &xmm_mask_hi,
1652                             &xmm_dst_lo, &xmm_dst_hi);
1653
1654         save_128_aligned (
1655             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1656
1657         ps += 4;
1658         pd += 4;
1659         pm += 4;
1660         w -= 4;
1661     }
1662
1663     while (w)
1664     {
1665         s = *ps++;
1666         m = *pm++;
1667         *pd++ = pack_1x128_32 (
1668             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1669         w--;
1670     }
1671 }
1672
1673 static force_inline uint32_t
1674 core_combine_over_ca_pixel_sse2 (uint32_t src,
1675                                  uint32_t mask,
1676                                  uint32_t dst)
1677 {
1678     __m128i s = unpack_32_1x128 (src);
1679     __m128i expAlpha = expand_alpha_1x128 (s);
1680     __m128i unpk_mask = unpack_32_1x128 (mask);
1681     __m128i unpk_dst  = unpack_32_1x128 (dst);
1682
1683     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1684 }
1685
1686 static force_inline void
1687 core_combine_over_ca_sse2 (uint32_t*       pd,
1688                            const uint32_t* ps,
1689                            const uint32_t *pm,
1690                            int             w)
1691 {
1692     uint32_t s, m, d;
1693
1694     __m128i xmm_alpha_lo, xmm_alpha_hi;
1695     __m128i xmm_src_lo, xmm_src_hi;
1696     __m128i xmm_dst_lo, xmm_dst_hi;
1697     __m128i xmm_mask_lo, xmm_mask_hi;
1698
1699     while (w && (unsigned long)pd & 15)
1700     {
1701         s = *ps++;
1702         m = *pm++;
1703         d = *pd;
1704
1705         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1706         w--;
1707     }
1708
1709     while (w >= 4)
1710     {
1711         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1712         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1713         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1714
1715         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1716         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1717         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1718
1719         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1720                             &xmm_alpha_lo, &xmm_alpha_hi);
1721
1722         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1723                        &xmm_alpha_lo, &xmm_alpha_hi,
1724                        &xmm_mask_lo, &xmm_mask_hi,
1725                        &xmm_dst_lo, &xmm_dst_hi);
1726
1727         save_128_aligned (
1728             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1729
1730         ps += 4;
1731         pd += 4;
1732         pm += 4;
1733         w -= 4;
1734     }
1735
1736     while (w)
1737     {
1738         s = *ps++;
1739         m = *pm++;
1740         d = *pd;
1741
1742         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1743         w--;
1744     }
1745 }
1746
1747 static force_inline uint32_t
1748 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1749                                          uint32_t mask,
1750                                          uint32_t dst)
1751 {
1752     __m128i d = unpack_32_1x128 (dst);
1753
1754     return pack_1x128_32 (
1755         over_1x128 (d, expand_alpha_1x128 (d),
1756                     pix_multiply_1x128 (unpack_32_1x128 (src),
1757                                         unpack_32_1x128 (mask))));
1758 }
1759
1760 static force_inline void
1761 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1762                                    const uint32_t* ps,
1763                                    const uint32_t *pm,
1764                                    int             w)
1765 {
1766     uint32_t s, m, d;
1767
1768     __m128i xmm_alpha_lo, xmm_alpha_hi;
1769     __m128i xmm_src_lo, xmm_src_hi;
1770     __m128i xmm_dst_lo, xmm_dst_hi;
1771     __m128i xmm_mask_lo, xmm_mask_hi;
1772
1773     while (w && (unsigned long)pd & 15)
1774     {
1775         s = *ps++;
1776         m = *pm++;
1777         d = *pd;
1778
1779         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1780         w--;
1781     }
1782
1783     while (w >= 4)
1784     {
1785         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1786         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1787         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1788
1789         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1790         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1791         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1792
1793         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1794                             &xmm_alpha_lo, &xmm_alpha_hi);
1795         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1796                             &xmm_mask_lo, &xmm_mask_hi,
1797                             &xmm_mask_lo, &xmm_mask_hi);
1798
1799         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1800                     &xmm_alpha_lo, &xmm_alpha_hi,
1801                     &xmm_mask_lo, &xmm_mask_hi);
1802
1803         save_128_aligned (
1804             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1805
1806         ps += 4;
1807         pd += 4;
1808         pm += 4;
1809         w -= 4;
1810     }
1811
1812     while (w)
1813     {
1814         s = *ps++;
1815         m = *pm++;
1816         d = *pd;
1817
1818         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1819         w--;
1820     }
1821 }
1822
static force_inline void
core_combine_in_ca_sse2 (uint32_t *      pd,
                         const uint32_t *ps,
                         const uint32_t *pm,
                         int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }
}

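/* Component-alpha IN_REVERSE: the destination is clipped by the masked
 * source alpha.  Per pixel: result = d * (m * src_alpha).
 */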
static force_inline void
core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
                                 const uint32_t *ps,
                                 const uint32_t *pm,
                                 int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }
}

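/* Component-alpha OUT: the masked source is kept only where the
 * destination is transparent.
 * Per pixel: result = (s * m) * (1 - dest_alpha).
 */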
static force_inline void
core_combine_out_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));

        w--;
    }
}

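/* Component-alpha OUT_REVERSE: the destination is kept only where the
 * masked source is transparent.
 * Per pixel: result = d * (1 - m * src_alpha).
 */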
static force_inline void
core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
                                  const uint32_t *ps,
                                  const uint32_t *pm,
                                  int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                                 unpack_32_1x128 (m),
                                 expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                                 unpack_32_1x128 (m),
                                 expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }
}

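/* Component-alpha ATOP: the masked source is composited on top of the
 * destination, but only where the destination is opaque.  Per pixel:
 *
 *     result = (s * m) * dest_alpha + d * (1 - m * src_alpha)
 */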
static force_inline uint32_t
core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i m = unpack_32_1x128 (mask);
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);
    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = expand_alpha_1x128 (d);

    s = pix_multiply_1x128 (s, m);
    m = negate_1x128 (pix_multiply_1x128 (m, sa));

    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}

static force_inline void
core_combine_atop_ca_sse2 (uint32_t *      pd,
                           const uint32_t *ps,
                           const uint32_t *pm,
                           int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

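/* Component-alpha ATOP_REVERSE: the roles of source and destination
 * are swapped.  Per pixel:
 *
 *     result = (s * m) * (1 - dest_alpha) + d * (m * src_alpha)
 */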
static force_inline uint32_t
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i m = unpack_32_1x128 (mask);
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
    __m128i sa = expand_alpha_1x128 (s);

    s = pix_multiply_1x128 (s, m);
    m = pix_multiply_1x128 (m, sa);

    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}

static force_inline void
core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
                                   const uint32_t *ps,
                                   const uint32_t *pm,
                                   int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

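/* Component-alpha XOR: source and destination each survive only where
 * the other is transparent.  Per pixel:
 *
 *     result = (s * m) * (1 - dest_alpha) + d * (1 - m * src_alpha)
 */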
static force_inline uint32_t
core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                uint32_t mask,
                                uint32_t dst)
{
    __m128i a = unpack_32_1x128 (mask);
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
                                       a, expand_alpha_1x128 (s)));
    __m128i dest      = pix_multiply_1x128 (s, a);
    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
                                                  &alpha_dst,
                                                  &dest,
                                                  &alpha_src));
}

static force_inline void
core_combine_xor_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

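/* Component-alpha ADD: saturating per-channel sum of the masked source
 * and the destination, result = clamp (s * m + d), implemented with the
 * unsigned saturating add _mm_adds_epu8.
 */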
static force_inline void
core_combine_add_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
                                               unpack_32_1x128 (m)),
                           unpack_32_1x128 (d)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (
                _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
                _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
                                               unpack_32_1x128 (m)),
                           unpack_32_1x128 (d)));
        w--;
    }
}

/* ---------------------------------------------------
 * fb_compose_setup_SSE2
 */
static force_inline __m64
create_mask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static force_inline __m128i
create_mask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static force_inline __m64
create_mask_2x32_64 (uint32_t mask0,
                     uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

/* Work around a code generation bug in Sun Studio 12. */
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
# define create_mask_2x32_128(mask0, mask1)                             \
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
#endif

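/* A usage sketch for the helpers above (hypothetical values -- the
 * actual initialization of the static masks declared at the top of
 * this file happens elsewhere):
 *
 *     mask_0080     = create_mask_16_128 (0x0080);
 *     mask_00ff     = create_mask_16_128 (0x00ff);
 *     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
 */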
/* SSE2 code patch for fbcompose.c */

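/* Each wrapper below adapts one of the core combiners above to the
 * combine-function signature used by pixman_implementation_t, and ends
 * with _mm_empty () so that any MMX state left by the __m64 helpers is
 * cleared before returning to code that may use x87 floating point.
 */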
static void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_over_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dst,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    core_combine_over_reverse_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dst,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    core_combine_in_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dst,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    core_combine_reverse_in_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_out_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dst,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    core_combine_reverse_out_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_atop_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dst,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_xor_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_add_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               dst,
                         const uint32_t *         src,
                         const uint32_t *         mask,
                         int                      width)
{
    core_combine_saturate_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_src_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               dst,
                      const uint32_t *         src,
                      const uint32_t *         mask,
                      int                      width)
{
    core_combine_over_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               dst,
                              const uint32_t *         src,
                              const uint32_t *         mask,
                              int                      width)
{
    core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_in_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dst,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_out_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dst,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               dst,
                      const uint32_t *         src,
                      const uint32_t *         mask,
                      int                      width)
{
    core_combine_atop_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               dst,
                              const uint32_t *         src,
                              const uint32_t *         mask,
                              int                      width)
{
    core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_xor_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_xor_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_add_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_add_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

/* -------------------------------------------------------------------
 * composite_over_n_8888
 */

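/* Fast path for a solid source OVER an a8r8g8b8/x8r8g8b8 destination.
 * The source and its expanded alpha are computed once per call; each
 * destination pixel then becomes result = s + d * (1 - src_alpha).
 */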
static void
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            pixman_image_t *         src_image,
                            pixman_image_t *         mask_image,
                            pixman_image_t *         dst_image,
                            int32_t                  src_x,
                            int32_t                  src_y,
                            int32_t                  mask_x,
                            int32_t                  mask_y,
                            int32_t                  dest_x,
                            int32_t                  dest_y,
                            int32_t                  width,
                            int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;
            *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
                                                xmm_alpha,
                                                unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 4)
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            d = *dst;
            *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
                                                xmm_alpha,
                                                unpack_32_1x128 (d)));
            w--;
        }
    }

    _mm_empty ();
}

/* ---------------------------------------------------------------------
 * composite_over_n_0565
 */
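/* Solid source OVER an r5g6b5 destination.  Each 16-bit pixel is
 * expanded to 8888 (expand565_16_1x128), blended, and packed back
 * (pack_565_32_16); the vector loop converts one 128-bit load -- eight
 * 565 pixels -- at a time via unpack_565_128_4x128/pack_565_4x128_128.
 */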
static void
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            pixman_image_t *         src_image,
                            pixman_image_t *         mask_image,
                            pixman_image_t *         dst_image,
                            int32_t                  src_x,
                            int32_t                  src_y,
                            int32_t                  mask_x,
                            int32_t                  mask_y,
                            int32_t                  dest_x,
                            int32_t                  dest_y,
                            int32_t                  width,
                            int32_t                  height)
{
    uint32_t src;
    uint16_t    *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;

            *dst++ = pack_565_32_16 (
                pack_1x128_32 (over_1x128 (xmm_src,
                                           xmm_alpha,
                                           expand565_16_1x128 (d))));
            w--;
        }

        while (w >= 8)
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst0, &xmm_dst1);
            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst2, &xmm_dst3);

            xmm_dst = pack_565_4x128_128 (
                &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            save_128_aligned ((__m128i*)dst, xmm_dst);

            dst += 8;
            w -= 8;
        }

        while (w--)
        {
            d = *dst;
            *dst++ = pack_565_32_16 (
                pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
                                           expand565_16_1x128 (d))));
        }
    }

    _mm_empty ();
}

/* ------------------------------
 * composite_add_n_8888_8888_ca
 */
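/* Solid source, component-alpha a8r8g8b8 mask, ADD operator:
 * result = clamp (s * m + d) per channel.  Four-pixel blocks whose
 * mask is entirely zero leave the destination untouched (see the
 * pack_cmp test below).
 */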
static void
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_op_t              op,
                                   pixman_image_t *         src_image,
                                   pixman_image_t *         mask_image,
                                   pixman_image_t *         dst_image,
                                   int32_t                  src_x,
                                   int32_t                  src_y,
                                   int32_t                  mask_x,
                                   int32_t                  mask_y,
                                   int32_t                  dest_x,
                                   int32_t                  dest_y,
                                   int32_t                  width,
                                   int32_t                  height)
{
    uint32_t src, srca;
    uint32_t    *dst_line, d;
    uint32_t    *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    srca = src >> 24;

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src   = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
        int w = width;
        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)pd & 15)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;

                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (
                    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
            }

            pd++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)pm);

            pack_cmp =
                _mm_movemask_epi8 (
                    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            /* if all bits in the mask are zero, pack_cmp is 0xffff and the write can be skipped */
            if (pack_cmp != 0xffff)
            {
                xmm_dst = load_128_aligned ((__m128i*)pd);

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                pix_multiply_2x128 (&xmm_src, &xmm_src,
                                    &xmm_mask_lo, &xmm_mask_hi,
                                    &xmm_mask_lo, &xmm_mask_hi);
                xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);

                save_128_aligned (
                    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
            }

            pd += 4;
            pm += 4;
            w -= 4;
        }

        while (w)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;

                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (
                    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
            }

            pd++;
            w--;
        }
    }

    _mm_empty ();
}

/* ---------------------------------------------------------------------------
 * composite_over_n_8888_8888_ca
 */

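/* Solid source, component-alpha a8r8g8b8 mask, OVER operator.
 * Per pixel: result = s * m + d * (1 - src_alpha * m), i.e. in_over of
 * the solid source, its alpha, the mask and the destination.
 */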
static void
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, d;
    uint32_t    *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src   = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
        int w = width;
        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)pd & 15)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;
                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                    &mmx_alpha,
                                                    &mmx_mask,
                                                    &mmx_dest));
            }

            pd++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)pm);

            pack_cmp =
                _mm_movemask_epi8 (
                    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            /* if all bits in the mask are zero, pack_cmp is 0xffff and the write can be skipped */
            if (pack_cmp != 0xffff)
            {
                xmm_dst = load_128_aligned ((__m128i*)pd);

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            pd += 4;
            pm += 4;
            w -= 4;
        }

        while (w)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;
                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (
                    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
            }

            pd++;
            w--;
        }
    }

    _mm_empty ();
}

/*---------------------------------------------------------------------
 * composite_over_8888_n_8888
 */

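/* a8r8g8b8 source OVER an a8r8g8b8 destination with a solid mask.
 * Only the mask's alpha matters here: mask >> 24 is replicated across
 * all channels, so each pixel computes
 * result = s * ma + d * (1 - src_alpha * ma).  Four-pixel source
 * blocks that are entirely zero are skipped.
 */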
static void
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    uint32_t mask;
    int32_t w;
    int dst_stride, src_stride;

    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);

    xmm_mask = create_mask_16_128 (mask >> 24);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            uint32_t s = *src++;

            if (s)
            {
                uint32_t d = *dst;

                __m128i ms = unpack_32_1x128 (s);
                __m128i alpha = expand_alpha_1x128 (ms);
                __m128i mask  = xmm_mask;
                __m128i dest  = unpack_32_1x128 (d);

                *dst = pack_1x128_32 (
                    in_over_1x128 (&ms, &alpha, &mask, &dest));
            }
            dst++;
            w--;
        }

        while (w >= 4)
        {
            xmm_src = load_128_unaligned ((__m128i*)src);

            if (!is_zero (xmm_src))
            {
                xmm_dst = load_128_aligned ((__m128i*)dst);

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
                expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                    &xmm_alpha_lo, &xmm_alpha_hi);

                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                               &xmm_alpha_lo, &xmm_alpha_hi,
                               &xmm_mask, &xmm_mask,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t s = *src++;

            if (s)
            {
                uint32_t d = *dst;

                __m128i ms = unpack_32_1x128 (s);
                __m128i alpha = expand_alpha_1x128 (ms);
                __m128i mask  = xmm_mask;
                __m128i dest  = unpack_32_1x128 (d);

                *dst = pack_1x128_32 (
                    in_over_1x128 (&ms, &alpha, &mask, &dest));
            }

            dst++;
            w--;
        }
    }

    _mm_empty ();
}

/*---------------------------------------------------------------------
 * composite_src_x888_8888
 */

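/* SRC copy from x8r8g8b8 to a8r8g8b8: every pixel is copied with its
 * alpha byte forced to 0xff.  The vector loop ORs mask_ff000000 into
 * sixteen pixels (four 128-bit registers) per iteration.
 */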
static void
sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              pixman_image_t *         src_image,
                              pixman_image_t *         mask_image,
                              pixman_image_t *         dst_image,
                              int32_t                  src_x,
                              int32_t                  src_y,
                              int32_t                  mask_x,
                              int32_t                  mask_y,
                              int32_t                  dest_x,
                              int32_t                  dest_y,
                              int32_t                  width,
                              int32_t                  height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int32_t w;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            *dst++ = *src++ | 0xff000000;
            w--;
        }

        while (w >= 16)
        {
            __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;

            xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
            xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
            xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
            xmm_src4 = load_128_unaligned ((__m128i*)src + 3);

            save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
            save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
            save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
            save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));

            dst += 16;
            src += 16;
            w -= 16;
        }

        while (w)
        {
            *dst++ = *src++ | 0xff000000;
            w--;
        }
    }

    _mm_empty ();
}

/* ---------------------------------------------------------------------
 * composite_over_x888_n_8888
 */
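/* x8r8g8b8 source OVER an a8r8g8b8 destination with a solid mask.
 * The source is made opaque (alpha forced to 0xff), so its expanded
 * alpha is simply the constant mask_00ff vector and each pixel reduces
 * to result = s * ma + d * (1 - ma), with ma = mask >> 24.
 */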
3384 static void
3385 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3386                                  pixman_op_t              op,
3387                                  pixman_image_t *         src_image,
3388                                  pixman_image_t *         mask_image,
3389                                  pixman_image_t *         dst_image,
3390                                  int32_t                  src_x,
3391                                  int32_t                  src_y,
3392                                  int32_t                  mask_x,
3393                                  int32_t                  mask_y,
3394                                  int32_t                  dest_x,
3395                                  int32_t                  dest_y,
3396                                  int32_t                  width,
3397                                  int32_t                  height)
3398 {
3399     uint32_t    *dst_line, *dst;
3400     uint32_t    *src_line, *src;
3401     uint32_t mask;
3402     int dst_stride, src_stride;
3403     int32_t w;
3404
3405     __m128i xmm_mask, xmm_alpha;
3406     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3407     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3408
3409     PIXMAN_IMAGE_GET_LINE (
3410         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3411     PIXMAN_IMAGE_GET_LINE (
3412         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3413
3414     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3415
3416     xmm_mask = create_mask_16_128 (mask >> 24);
3417     xmm_alpha = mask_00ff;
3418
3419     while (height--)
3420     {
3421         dst = dst_line;
3422         dst_line += dst_stride;
3423         src = src_line;
3424         src_line += src_stride;
3425         w = width;
3426
3427         while (w && (unsigned long)dst & 15)
3428         {
3429             uint32_t s = (*src++) | 0xff000000;
3430             uint32_t d = *dst;
3431
3432             __m128i src   = unpack_32_1x128 (s);
3433             __m128i alpha = xmm_alpha;
3434             __m128i mask  = xmm_mask;
3435             __m128i dest  = unpack_32_1x128 (d);
3436
3437             *dst++ = pack_1x128_32 (
3438                 in_over_1x128 (&src, &alpha, &mask, &dest));
3439
3440             w--;
3441         }
3442
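             /* 4 pixels at a time: force the x888 source opaque, then blend
              * with in_over, using the solid mask's alpha broadcast into
              * xmm_mask and a constant 0xff source alpha in xmm_alpha.
              */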
3443         while (w >= 4)
3444         {
3445             xmm_src = _mm_or_si128 (
3446                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3447             xmm_dst = load_128_aligned ((__m128i*)dst);
3448
3449             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3450             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3451
3452             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3453                            &xmm_alpha, &xmm_alpha,
3454                            &xmm_mask, &xmm_mask,
3455                            &xmm_dst_lo, &xmm_dst_hi);
3456
3457             save_128_aligned (
3458                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3459
3460             dst += 4;
3461             src += 4;
3462             w -= 4;
3464         }
3465
3466         while (w)
3467         {
3468             uint32_t s = (*src++) | 0xff000000;
3469             uint32_t d = *dst;
3470
3471             __m128i src  = unpack_32_1x128 (s);
3472             __m128i alpha = xmm_alpha;
3473             __m128i mask  = xmm_mask;
3474             __m128i dest  = unpack_32_1x128 (d);
3475
3476             *dst++ = pack_1x128_32 (
3477                 in_over_1x128 (&src, &alpha, &mask, &dest));
3478
3479             w--;
3480         }
3481     }
3482
3483     _mm_empty ();
3484 }
3485
3486 /* --------------------------------------------------------------------
3487  * composite_over_8888_8888
3488  */
3489 static void
3490 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3491                                pixman_op_t              op,
3492                                pixman_image_t *         src_image,
3493                                pixman_image_t *         mask_image,
3494                                pixman_image_t *         dst_image,
3495                                int32_t                  src_x,
3496                                int32_t                  src_y,
3497                                int32_t                  mask_x,
3498                                int32_t                  mask_y,
3499                                int32_t                  dest_x,
3500                                int32_t                  dest_y,
3501                                int32_t                  width,
3502                                int32_t                  height)
3503 {
3504     int dst_stride, src_stride;
3505     uint32_t    *dst_line, *dst;
3506     uint32_t    *src_line, *src;
3507
3508     PIXMAN_IMAGE_GET_LINE (
3509         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3510     PIXMAN_IMAGE_GET_LINE (
3511         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3512
3513     dst = dst_line;
3514     src = src_line;
3515
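         /* Each scanline is handed whole to the shared
          * core_combine_over_u_sse2 OVER combiner.
          */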
3516     while (height--)
3517     {
3518         core_combine_over_u_sse2 (dst, src, NULL, width);
3519
3520         dst += dst_stride;
3521         src += src_stride;
3522     }
3523     _mm_empty ();
3524 }
3525
3526 /* ------------------------------------------------------------------
3527  * composite_over_8888_0565
3528  */
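     /* Composite a single a8r8g8b8 pixel OVER a single r5g6b5 pixel:
      * expand the dst to 8888, blend, and pack the result back to 565.
      */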
3529 static force_inline uint16_t
3530 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3531 {
3532     __m128i ms;
3533
3534     ms = unpack_32_1x128 (src);
3535     return pack_565_32_16 (
3536         pack_1x128_32 (
3537             over_1x128 (
3538                 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3539 }
3540
3541 static void
3542 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3543                                pixman_op_t              op,
3544                                pixman_image_t *         src_image,
3545                                pixman_image_t *         mask_image,
3546                                pixman_image_t *         dst_image,
3547                                int32_t                  src_x,
3548                                int32_t                  src_y,
3549                                int32_t                  mask_x,
3550                                int32_t                  mask_y,
3551                                int32_t                  dest_x,
3552                                int32_t                  dest_y,
3553                                int32_t                  width,
3554                                int32_t                  height)
3555 {
3556     uint16_t    *dst_line, *dst, d;
3557     uint32_t    *src_line, *src, s;
3558     int dst_stride, src_stride;
3559     int32_t w;
3560
3561     __m128i xmm_alpha_lo, xmm_alpha_hi;
3562     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3563     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3564
3565     PIXMAN_IMAGE_GET_LINE (
3566         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3567     PIXMAN_IMAGE_GET_LINE (
3568         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3569
3570 #if 0
3571     /* FIXME
3572      *
3573      * This code was copied from the MMX version, FIXME included.
3574      * If it's a problem there, it's probably a problem here too.
3575      */
3576     assert (src_image->drawable == mask_image->drawable);
3577 #endif
3578
3579     while (height--)
3580     {
3581         dst = dst_line;
3582         src = src_line;
3583
3584         dst_line += dst_stride;
3585         src_line += src_stride;
3586         w = width;
3587
3588         /* Align dst on a 16-byte boundary */
3589         while (w &&
3590                ((unsigned long)dst & 15))
3591         {
3592             s = *src++;
3593             d = *dst;
3594
3595             *dst++ = composite_over_8888_0565pixel (s, d);
3596             w--;
3597         }
3598
3599         /* 8-pixel main loop */
3600         while (w >= 8)
3601         {
3602             /* Load the source unaligned, since its address is not
3603              * guaranteed to be 16-byte aligned.
3604              */
3605             xmm_src = load_128_unaligned ((__m128i*) src);
3606             xmm_dst = load_128_aligned ((__m128i*) dst);
3607
3608             /* Unpacking */
3609             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3610             unpack_565_128_4x128 (xmm_dst,
3611                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3612             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3613                                 &xmm_alpha_lo, &xmm_alpha_hi);
3614
3615             /* Load the next 4 pixels from memory early, so the read
3616              * overlaps with the blend below.
3617              */
3618             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3619
3620             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3621                         &xmm_alpha_lo, &xmm_alpha_hi,
3622                         &xmm_dst0, &xmm_dst1);
3623
3624             /* Unpacking */
3625             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3626             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3627                                 &xmm_alpha_lo, &xmm_alpha_hi);
3628
3629             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3630                         &xmm_alpha_lo, &xmm_alpha_hi,
3631                         &xmm_dst2, &xmm_dst3);
3632
3633             save_128_aligned (
3634                 (__m128i*)dst, pack_565_4x128_128 (
3635                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3636
3637             w -= 8;
3638             dst += 8;
3639             src += 8;
3640         }
3641
3642         while (w--)
3643         {
3644             s = *src++;
3645             d = *dst;
3646
3647             *dst++ = composite_over_8888_0565pixel (s, d);
3648         }
3649     }
3650
3651     _mm_empty ();
3652 }
3653
3654 /* -----------------------------------------------------------------
3655  * composite_over_n_8_8888
3656  */
3657
3658 static void
3659 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3660                               pixman_op_t              op,
3661                               pixman_image_t *         src_image,
3662                               pixman_image_t *         mask_image,
3663                               pixman_image_t *         dst_image,
3664                               int32_t                  src_x,
3665                               int32_t                  src_y,
3666                               int32_t                  mask_x,
3667                               int32_t                  mask_y,
3668                               int32_t                  dest_x,
3669                               int32_t                  dest_y,
3670                               int32_t                  width,
3671                               int32_t                  height)
3672 {
3673     uint32_t src, srca;
3674     uint32_t *dst_line, *dst;
3675     uint8_t *mask_line, *mask;
3676     int dst_stride, mask_stride;
3677     int32_t w;
3678     uint32_t m, d;
3679
3680     __m128i xmm_src, xmm_alpha, xmm_def;
3681     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3682     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3683
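         /* The mmx_* names are leftovers from the MMX code this was
          * converted from; they hold ordinary SSE2 registers here.
          */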
3684     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3685
3686     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3687
3688     srca = src >> 24;
3689     if (src == 0)
3690         return;
3691
3692     PIXMAN_IMAGE_GET_LINE (
3693         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3694     PIXMAN_IMAGE_GET_LINE (
3695         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3696
3697     xmm_def = create_mask_2x32_128 (src, src);
3698     xmm_src = expand_pixel_32_1x128 (src);
3699     xmm_alpha = expand_alpha_1x128 (xmm_src);
3700     mmx_src   = xmm_src;
3701     mmx_alpha = xmm_alpha;
3702
3703     while (height--)
3704     {
3705         dst = dst_line;
3706         dst_line += dst_stride;
3707         mask = mask_line;
3708         mask_line += mask_stride;
3709         w = width;
3710
3711         while (w && (unsigned long)dst & 15)
3712         {
3713             uint8_t m = *mask++;
3714
3715             if (m)
3716             {
3717                 d = *dst;
3718                 mmx_mask = expand_pixel_8_1x128 (m);
3719                 mmx_dest = unpack_32_1x128 (d);
3720
3721                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3722                                                    &mmx_alpha,
3723                                                    &mmx_mask,
3724                                                    &mmx_dest));
3725             }
3726
3727             w--;
3728             dst++;
3729         }
3730
3731         while (w >= 4)
3732         {
3733             m = *((uint32_t*)mask);
3734
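                 /* An opaque source under a fully set 4-pixel mask makes
                  * OVER degenerate to SRC, so the pre-expanded solid color
                  * can be stored directly.
                  */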
3735             if (srca == 0xff && m == 0xffffffff)
3736             {
3737                 save_128_aligned ((__m128i*)dst, xmm_def);
3738             }
3739             else if (m)
3740             {
3741                 xmm_dst = load_128_aligned ((__m128i*) dst);
3742                 xmm_mask = unpack_32_1x128 (m);
3743                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3744
3745                 /* Unpacking */
3746                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3747                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3748
3749                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3750                                         &xmm_mask_lo, &xmm_mask_hi);
3751
3752                 in_over_2x128 (&xmm_src, &xmm_src,
3753                                &xmm_alpha, &xmm_alpha,
3754                                &xmm_mask_lo, &xmm_mask_hi,
3755                                &xmm_dst_lo, &xmm_dst_hi);
3756
3757                 save_128_aligned (
3758                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3759             }
3760
3761             w -= 4;
3762             dst += 4;
3763             mask += 4;
3764         }
3765
3766         while (w)
3767         {
3768             uint8_t m = *mask++;
3769
3770             if (m)
3771             {
3772                 d = *dst;
3773                 mmx_mask = expand_pixel_8_1x128 (m);
3774                 mmx_dest = unpack_32_1x128 (d);
3775
3776                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3777                                                    &mmx_alpha,
3778                                                    &mmx_mask,
3779                                                    &mmx_dest));
3780             }
3781
3782             w--;
3783             dst++;
3784         }
3785     }
3786
3787     _mm_empty ();
3788 }
3789
3790 /* ----------------------------------------------------------------
3791  * pixman_fill_sse2
3792  */
3793
3794 pixman_bool_t
3795 pixman_fill_sse2 (uint32_t *bits,
3796                   int       stride,
3797                   int       bpp,
3798                   int       x,
3799                   int       y,
3800                   int       width,
3801                   int       height,
3802                   uint32_t  data)
3803 {
3804     uint32_t byte_width;
3805     uint8_t         *byte_line;
3806
3807     __m128i xmm_def;
3808
3809     if (bpp == 8)
3810     {
3811         uint8_t b;
3812         uint16_t w;
3813
3814         stride = stride * (int) sizeof (uint32_t) / 1;
3815         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3816         byte_width = width;
3817         stride *= 1;
3818
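             /* Replicate the fill byte into all four bytes of 'data' so
              * the wider stores below write the same value everywhere,
              * e.g. 0xab -> 0xabababab.
              */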
3819         b = data & 0xff;
3820         w = (b << 8) | b;
3821         data = (w << 16) | w;
3822     }
3823     else if (bpp == 16)
3824     {
3825         stride = stride * (int) sizeof (uint32_t) / 2;
3826         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3827         byte_width = 2 * width;
3828         stride *= 2;
3829
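             /* Multiplying by 0x00010001 copies the 16-bit value into
              * both halves of the word, e.g. 0x1234 -> 0x12341234.
              */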
3830         data = (data & 0xffff) * 0x00010001;
3831     }
3832     else if (bpp == 32)
3833     {
3834         stride = stride * (int) sizeof (uint32_t) / 4;
3835         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3836         byte_width = 4 * width;
3837         stride *= 4;
3838     }
3839     else
3840     {
3841         return FALSE;
3842     }
3843
3844     xmm_def = create_mask_2x32_128 (data, data);
3845
3846     while (height--)
3847     {
3848         int w;
3849         uint8_t *d = byte_line;
3850         byte_line += stride;
3851         w = byte_width;
3852
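             /* Step up through 1-, 2- and 4-byte stores until d is
              * 16-byte aligned, then fill with aligned 128-bit stores,
              * unrolled up to 128 bytes per iteration.
              */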
3853         while (w >= 1 && ((unsigned long)d & 1))
3854         {
3855             *(uint8_t *)d = data;
3856             w -= 1;
3857             d += 1;
3858         }
3859
3860         while (w >= 2 && ((unsigned long)d & 3))
3861         {
3862             *(uint16_t *)d = data;
3863             w -= 2;
3864             d += 2;
3865         }
3866
3867         while (w >= 4 && ((unsigned long)d & 15))
3868         {
3869             *(uint32_t *)d = data;
3870
3871             w -= 4;
3872             d += 4;
3873         }
3874
3875         while (w >= 128)
3876         {
3877             save_128_aligned ((__m128i*)(d),     xmm_def);
3878             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3879             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3880             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3881             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3882             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3883             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3884             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3885
3886             d += 128;
3887             w -= 128;
3888         }
3889
3890         if (w >= 64)
3891         {
3892             save_128_aligned ((__m128i*)(d),     xmm_def);
3893             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3894             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3895             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3896
3897             d += 64;
3898             w -= 64;
3899         }
3900
3901         if (w >= 32)
3902         {
3903             save_128_aligned ((__m128i*)(d),     xmm_def);
3904             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3905
3906             d += 32;
3907             w -= 32;
3908         }
3909
3910         if (w >= 16)
3911         {
3912             save_128_aligned ((__m128i*)(d),     xmm_def);
3913
3914             d += 16;
3915             w -= 16;
3916         }
3917
3918         while (w >= 4)
3919         {
3920             *(uint32_t *)d = data;
3921
3922             w -= 4;
3923             d += 4;
3924         }
3925
3926         if (w >= 2)
3927         {
3928             *(uint16_t *)d = data;
3929             w -= 2;
3930             d += 2;
3931         }
3932
3933         if (w >= 1)
3934         {
3935             *(uint8_t *)d = data;
3936             w -= 1;
3937             d += 1;
3938         }
3939     }
3940
3941     _mm_empty ();
3942     return TRUE;
3943 }
3944
3945 static void
3946 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3947                              pixman_op_t              op,
3948                              pixman_image_t *         src_image,
3949                              pixman_image_t *         mask_image,
3950                              pixman_image_t *         dst_image,
3951                              int32_t                  src_x,
3952                              int32_t                  src_y,
3953                              int32_t                  mask_x,
3954                              int32_t                  mask_y,
3955                              int32_t                  dest_x,
3956                              int32_t                  dest_y,
3957                              int32_t                  width,
3958                              int32_t                  height)
3959 {
3960     uint32_t src, srca;
3961     uint32_t    *dst_line, *dst;
3962     uint8_t     *mask_line, *mask;
3963     int dst_stride, mask_stride;
3964     int32_t w;
3965     uint32_t m;
3966
3967     __m128i xmm_src, xmm_def;
3968     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3969
3970     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3971
3972     srca = src >> 24;
3973     if (src == 0)
3974     {
3975         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3976                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
3977                           dest_x, dest_y, width, height, 0);
3978         return;
3979     }
3980
3981     PIXMAN_IMAGE_GET_LINE (
3982         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3983     PIXMAN_IMAGE_GET_LINE (
3984         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3985
3986     xmm_def = create_mask_2x32_128 (src, src);
3987     xmm_src = expand_pixel_32_1x128 (src);
3988
3989     while (height--)
3990     {
3991         dst = dst_line;
3992         dst_line += dst_stride;
3993         mask = mask_line;
3994         mask_line += mask_stride;
3995         w = width;
3996
3997         while (w && (unsigned long)dst & 15)
3998         {
3999             uint8_t m = *mask++;
4000
4001             if (m)
4002             {
4003                 *dst = pack_1x128_32 (
4004                     pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
4005             }
4006             else
4007             {
4008                 *dst = 0;
4009             }
4010
4011             w--;
4012             dst++;
4013         }
4014
4015         while (w >= 4)
4016         {
4017             m = *((uint32_t*)mask);
4018
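                 /* With an opaque source and a fully set mask word the
                  * result is the solid color itself; a zero mask word
                  * clears the four pixels instead.
                  */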
4019             if (srca == 0xff && m == 0xffffffff)
4020             {
4021                 save_128_aligned ((__m128i*)dst, xmm_def);
4022             }
4023             else if (m)
4024             {
4025                 xmm_mask = unpack_32_1x128 (m);
4026                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4027
4028                 /* Unpacking */
4029                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4030
4031                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4032                                         &xmm_mask_lo, &xmm_mask_hi);
4033
4034                 pix_multiply_2x128 (&xmm_src, &xmm_src,
4035                                     &xmm_mask_lo, &xmm_mask_hi,
4036                                     &xmm_mask_lo, &xmm_mask_hi);
4037
4038                 save_128_aligned (
4039                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4040             }
4041             else
4042             {
4043                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4044             }
4045
4046             w -= 4;
4047             dst += 4;
4048             mask += 4;
4049         }
4050
4051         while (w)
4052         {
4053             uint8_t m = *mask++;
4054
4055             if (m)
4056             {
4057                 *dst = pack_1x128_32 (
4058                     pix_multiply_1x128 (
4059                         xmm_src, expand_pixel_8_1x128 (m)));
4060             }
4061             else
4062             {
4063                 *dst = 0;
4064             }
4065
4066             w--;
4067             dst++;
4068         }
4069     }
4070
4071     _mm_empty ();
4072 }
4073
4074 /*-----------------------------------------------------------------------
4075  * composite_over_n_8_0565
4076  */
4077
4078 static void
4079 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4080                               pixman_op_t              op,
4081                               pixman_image_t *         src_image,
4082                               pixman_image_t *         mask_image,
4083                               pixman_image_t *         dst_image,
4084                               int32_t                  src_x,
4085                               int32_t                  src_y,
4086                               int32_t                  mask_x,
4087                               int32_t                  mask_y,
4088                               int32_t                  dest_x,
4089                               int32_t                  dest_y,
4090                               int32_t                  width,
4091                               int32_t                  height)
4092 {
4093     uint32_t src, srca;
4094     uint16_t    *dst_line, *dst, d;
4095     uint8_t     *mask_line, *mask;
4096     int dst_stride, mask_stride;
4097     int32_t w;
4098     uint32_t m;
4099     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4100
4101     __m128i xmm_src, xmm_alpha;
4102     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4103     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4104
4105     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4106
4107     srca = src >> 24;
4108     if (src == 0)
4109         return;
4110
4111     PIXMAN_IMAGE_GET_LINE (
4112         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4113     PIXMAN_IMAGE_GET_LINE (
4114         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4115
4116     xmm_src = expand_pixel_32_1x128 (src);
4117     xmm_alpha = expand_alpha_1x128 (xmm_src);
4118     mmx_src = xmm_src;
4119     mmx_alpha = xmm_alpha;
4120
4121     while (height--)
4122     {
4123         dst = dst_line;
4124         dst_line += dst_stride;
4125         mask = mask_line;
4126         mask_line += mask_stride;
4127         w = width;
4128
4129         while (w && (unsigned long)dst & 15)
4130         {
4131             m = *mask++;
4132
4133             if (m)
4134             {
4135                 d = *dst;
4136                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4137                 mmx_dest = expand565_16_1x128 (d);
4138
4139                 *dst = pack_565_32_16 (
4140                     pack_1x128_32 (
4141                         in_over_1x128 (
4142                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4143             }
4144
4145             w--;
4146             dst++;
4147         }
4148
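             /* 8 pixels per iteration: the 565 dst unpacks into four
              * registers of two pixels each, and the a8 mask is consumed
              * as two 4-byte words.
              */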
4149         while (w >= 8)
4150         {
4151             xmm_dst = load_128_aligned ((__m128i*) dst);
4152             unpack_565_128_4x128 (xmm_dst,
4153                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4154
4155             m = *((uint32_t*)mask);
4156             mask += 4;
4157
4158             if (m)
4159             {
4160                 xmm_mask = unpack_32_1x128 (m);
4161                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4162
4163                 /* Unpacking */
4164                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4165
4166                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4167                                         &xmm_mask_lo, &xmm_mask_hi);
4168
4169                 in_over_2x128 (&xmm_src, &xmm_src,
4170                                &xmm_alpha, &xmm_alpha,
4171                                &xmm_mask_lo, &xmm_mask_hi,
4172                                &xmm_dst0, &xmm_dst1);
4173             }
4174
4175             m = *((uint32_t*)mask);
4176             mask += 4;
4177
4178             if (m)
4179             {
4180                 xmm_mask = unpack_32_1x128 (m);
4181                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4182
4183                 /* Unpacking */
4184                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4185
4186                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4187                                         &xmm_mask_lo, &xmm_mask_hi);
4188                 in_over_2x128 (&xmm_src, &xmm_src,
4189                                &xmm_alpha, &xmm_alpha,
4190                                &xmm_mask_lo, &xmm_mask_hi,
4191                                &xmm_dst2, &xmm_dst3);
4192             }
4193
4194             save_128_aligned (
4195                 (__m128i*)dst, pack_565_4x128_128 (
4196                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4197
4198             w -= 8;
4199             dst += 8;
4200         }
4201
4202         while (w)
4203         {
4204             m = *mask++;
4205
4206             if (m)
4207             {
4208                 d = *dst;
4209                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4210                 mmx_dest = expand565_16_1x128 (d);
4211
4212                 *dst = pack_565_32_16 (
4213                     pack_1x128_32 (
4214                         in_over_1x128 (
4215                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4216             }
4217
4218             w--;
4219             dst++;
4220         }
4221     }
4222
4223     _mm_empty ();
4224 }
4225
4226 /* -----------------------------------------------------------------------
4227  * composite_over_pixbuf_0565
4228  */
4229
4230 static void
4231 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4232                                  pixman_op_t              op,
4233                                  pixman_image_t *         src_image,
4234                                  pixman_image_t *         mask_image,
4235                                  pixman_image_t *         dst_image,
4236                                  int32_t                  src_x,
4237                                  int32_t                  src_y,
4238                                  int32_t                  mask_x,
4239                                  int32_t                  mask_y,
4240                                  int32_t                  dest_x,
4241                                  int32_t                  dest_y,
4242                                  int32_t                  width,
4243                                  int32_t                  height)
4244 {
4245     uint16_t    *dst_line, *dst, d;
4246     uint32_t    *src_line, *src, s;
4247     int dst_stride, src_stride;
4248     int32_t w;
4249     uint32_t opaque, zero;
4250
4251     __m128i ms;
4252     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4253     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4254
4255     PIXMAN_IMAGE_GET_LINE (
4256         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4257     PIXMAN_IMAGE_GET_LINE (
4258         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4259
4260 #if 0
4261     /* FIXME
4262      *
4263      * This code was copied from the MMX version, FIXME included.
4264      * If it's a problem there, it's probably a problem here too.
4265      */
4266     assert (src_image->drawable == mask_image->drawable);
4267 #endif
4268
4269     while (height--)
4270     {
4271         dst = dst_line;
4272         dst_line += dst_stride;
4273         src = src_line;
4274         src_line += src_stride;
4275         w = width;
4276
4277         while (w && (unsigned long)dst & 15)
4278         {
4279             s = *src++;
4280             d = *dst;
4281
4282             ms = unpack_32_1x128 (s);
4283
4284             *dst++ = pack_565_32_16 (
4285                 pack_1x128_32 (
4286                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
4287             w--;
4288         }
4289
4290         while (w >= 8)
4291         {
4292             /* First round */
4293             xmm_src = load_128_unaligned ((__m128i*)src);
4294             xmm_dst = load_128_aligned  ((__m128i*)dst);
4295
4296             opaque = is_opaque (xmm_src);
4297             zero = is_zero (xmm_src);
4298
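                 /* The pixbuf source goes through over_rev_non_pre
                  * (reversed channel order, not premultiplied): fully
                  * opaque vectors need only the R/B swap done by
                  * invert_colors, fully transparent ones leave dst
                  * untouched, and everything else takes the full path.
                  */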
4299             unpack_565_128_4x128 (xmm_dst,
4300                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4301             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4302
4303             /* preload next round */
4304             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4305
4306             if (opaque)
4307             {
4308                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4309                                      &xmm_dst0, &xmm_dst1);
4310             }
4311             else if (!zero)
4312             {
4313                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4314                                         &xmm_dst0, &xmm_dst1);
4315             }
4316
4317             /* Second round */
4318             opaque = is_opaque (xmm_src);
4319             zero = is_zero (xmm_src);
4320
4321             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4322
4323             if (opaque)
4324             {
4325                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4326                                      &xmm_dst2, &xmm_dst3);
4327             }
4328             else if (!zero)
4329             {
4330                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4331                                         &xmm_dst2, &xmm_dst3);
4332             }
4333
4334             save_128_aligned (
4335                 (__m128i*)dst, pack_565_4x128_128 (
4336                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4337
4338             w -= 8;
4339             src += 8;
4340             dst += 8;
4341         }
4342
4343         while (w)
4344         {
4345             s = *src++;
4346             d = *dst;
4347
4348             ms = unpack_32_1x128 (s);
4349
4350             *dst++ = pack_565_32_16 (
4351                 pack_1x128_32 (
4352                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
4353             w--;
4354         }
4355     }
4356
4357     _mm_empty ();
4358 }
4359
4360 /* -------------------------------------------------------------------------
4361  * composite_over_pixbuf_8888
4362  */
4363
4364 static void
4365 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4366                                  pixman_op_t              op,
4367                                  pixman_image_t *         src_image,
4368                                  pixman_image_t *         mask_image,
4369                                  pixman_image_t *         dst_image,
4370                                  int32_t                  src_x,
4371                                  int32_t                  src_y,
4372                                  int32_t                  mask_x,
4373                                  int32_t                  mask_y,
4374                                  int32_t                  dest_x,
4375                                  int32_t                  dest_y,
4376                                  int32_t                  width,
4377                                  int32_t                  height)
4378 {
4379     uint32_t    *dst_line, *dst, d;
4380     uint32_t    *src_line, *src, s;
4381     int dst_stride, src_stride;
4382     int32_t w;
4383     uint32_t opaque, zero;
4384
4385     __m128i xmm_src_lo, xmm_src_hi;
4386     __m128i xmm_dst_lo, xmm_dst_hi;
4387
4388     PIXMAN_IMAGE_GET_LINE (
4389         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4390     PIXMAN_IMAGE_GET_LINE (
4391         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4392
4393 #if 0
4394     /* FIXME
4395      *
4396      * This code was copied from the MMX version, FIXME included.
4397      * If it's a problem there, it's probably a problem here too.
4398      */
4399     assert (src_image->drawable == mask_image->drawable);
4400 #endif
4401
4402     while (height--)
4403     {
4404         dst = dst_line;
4405         dst_line += dst_stride;
4406         src = src_line;
4407         src_line += src_stride;
4408         w = width;
4409
4410         while (w && (unsigned long)dst & 15)
4411         {
4412             s = *src++;
4413             d = *dst;
4414
4415             *dst++ = pack_1x128_32 (
4416                 over_rev_non_pre_1x128 (
4417                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4418
4419             w--;
4420         }
4421
4422         while (w >= 4)
4423         {
4424             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4425
4426             opaque = is_opaque (xmm_src_hi);
4427             zero = is_zero (xmm_src_hi);
4428
4429             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4430
4431             if (opaque)
4432             {
4433                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4434                                      &xmm_dst_lo, &xmm_dst_hi);
4435
4436                 save_128_aligned (
4437                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4438             }
4439             else if (!zero)
4440             {
4441                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4442
4443                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4444
4445                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4446                                         &xmm_dst_lo, &xmm_dst_hi);
4447
4448                 save_128_aligned (
4449                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4450             }
4451
4452             w -= 4;
4453             dst += 4;
4454             src += 4;
4455         }
4456
4457         while (w)
4458         {
4459             s = *src++;
4460             d = *dst;
4461
4462             *dst++ = pack_1x128_32 (
4463                 over_rev_non_pre_1x128 (
4464                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4465
4466             w--;
4467         }
4468     }
4469
4470     _mm_empty ();
4471 }
4472
4473 /* -------------------------------------------------------------------------------------------------
4474  * composite_over_n_8888_0565_ca
4475  */
4476
4477 static void
4478 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4479                                     pixman_op_t              op,
4480                                     pixman_image_t *         src_image,
4481                                     pixman_image_t *         mask_image,
4482                                     pixman_image_t *         dst_image,
4483                                     int32_t                  src_x,
4484                                     int32_t                  src_y,
4485                                     int32_t                  mask_x,
4486                                     int32_t                  mask_y,
4487                                     int32_t                  dest_x,
4488                                     int32_t                  dest_y,
4489                                     int32_t                  width,
4490                                     int32_t                  height)
4491 {
4492     uint32_t src;
4493     uint16_t    *dst_line, *dst, d;
4494     uint32_t    *mask_line, *mask, m;
4495     int dst_stride, mask_stride;
4496     int w;
4497     uint32_t pack_cmp;
4498
4499     __m128i xmm_src, xmm_alpha;
4500     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4501     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4502
4503     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4504
4505     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4506
4507     if (src == 0)
4508         return;
4509
4510     PIXMAN_IMAGE_GET_LINE (
4511         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4512     PIXMAN_IMAGE_GET_LINE (
4513         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4514
4515     xmm_src = expand_pixel_32_1x128 (src);
4516     xmm_alpha = expand_alpha_1x128 (xmm_src);
4517     mmx_src = xmm_src;
4518     mmx_alpha = xmm_alpha;
4519
4520     while (height--)
4521     {
4522         w = width;
4523         mask = mask_line;
4524         dst = dst_line;
4525         mask_line += mask_stride;
4526         dst_line += dst_stride;
4527
4528         while (w && ((unsigned long)dst & 15))
4529         {
4530             m = *(uint32_t *) mask;
4531
4532             if (m)
4533             {
4534                 d = *dst;
4535                 mmx_mask = unpack_32_1x128 (m);
4536                 mmx_dest = expand565_16_1x128 (d);
4537
4538                 *dst = pack_565_32_16 (
4539                     pack_1x128_32 (
4540                         in_over_1x128 (
4541                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4542             }
4543
4544             w--;
4545             dst++;
4546             mask++;
4547         }
4548
4549         while (w >= 8)
4550         {
4551             /* First round */
4552             xmm_mask = load_128_unaligned ((__m128i*)mask);
4553             xmm_dst = load_128_aligned ((__m128i*)dst);
4554
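                 /* pack_cmp is 0xffff only when all four mask pixels are
                  * zero, in which case the in_over below can be skipped.
                  */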
4555             pack_cmp = _mm_movemask_epi8 (
4556                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4557
4558             unpack_565_128_4x128 (xmm_dst,
4559                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4560             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4561
4562             /* preload next round */
4563             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4564
4566             if (pack_cmp != 0xffff)
4567             {
4568                 in_over_2x128 (&xmm_src, &xmm_src,
4569                                &xmm_alpha, &xmm_alpha,
4570                                &xmm_mask_lo, &xmm_mask_hi,
4571                                &xmm_dst0, &xmm_dst1);
4572             }
4573
4574             /* Second round */
4575             pack_cmp = _mm_movemask_epi8 (
4576                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4577
4578             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4579
4580             if (pack_cmp != 0xffff)
4581             {
4582                 in_over_2x128 (&xmm_src, &xmm_src,
4583                                &xmm_alpha, &xmm_alpha,
4584                                &xmm_mask_lo, &xmm_mask_hi,
4585                                &xmm_dst2, &xmm_dst3);
4586             }
4587
4588             save_128_aligned (
4589                 (__m128i*)dst, pack_565_4x128_128 (
4590                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4591
4592             w -= 8;
4593             dst += 8;
4594             mask += 8;
4595         }
4596
4597         while (w)
4598         {
4599             m = *(uint32_t *) mask;
4600
4601             if (m)
4602             {
4603                 d = *dst;
4604                 mmx_mask = unpack_32_1x128 (m);
4605                 mmx_dest = expand565_16_1x128 (d);
4606
4607                 *dst = pack_565_32_16 (
4608                     pack_1x128_32 (
4609                         in_over_1x128 (
4610                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4611             }
4612
4613             w--;
4614             dst++;
4615             mask++;
4616         }
4617     }
4618
4619     _mm_empty ();
4620 }
4621
4622 /* -----------------------------------------------------------------------
4623  * composite_in_n_8_8
4624  */
4625
4626 static void
4627 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4628                          pixman_op_t              op,
4629                          pixman_image_t *         src_image,
4630                          pixman_image_t *         mask_image,
4631                          pixman_image_t *         dst_image,
4632                          int32_t                  src_x,
4633                          int32_t                  src_y,
4634                          int32_t                  mask_x,
4635                          int32_t                  mask_y,
4636                          int32_t                  dest_x,
4637                          int32_t                  dest_y,
4638                          int32_t                  width,
4639                          int32_t                  height)
4640 {
4641     uint8_t     *dst_line, *dst;
4642     uint8_t     *mask_line, *mask;
4643     int dst_stride, mask_stride;
4644     uint32_t d, m;
4645     uint32_t src;
4646     uint8_t sa;
4647     int32_t w;
4648
4649     __m128i xmm_alpha;
4650     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4651     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4652
4653     PIXMAN_IMAGE_GET_LINE (
4654         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4655     PIXMAN_IMAGE_GET_LINE (
4656         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4657
4658     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4659
4660     sa = src >> 24;
4661
4662     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4663
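         /* IN with a solid source and an a8 mask reduces to
          * dst = (srca * m / 255) * dst / 255, computed below as two
          * pix_multiply passes.
          */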
4664     while (height--)
4665     {
4666         dst = dst_line;
4667         dst_line += dst_stride;
4668         mask = mask_line;
4669         mask_line += mask_stride;
4670         w = width;
4671
4672         while (w && ((unsigned long)dst & 15))
4673         {
4674             m = (uint32_t) *mask++;
4675             d = (uint32_t) *dst;
4676
4677             *dst++ = (uint8_t) pack_1x128_32 (
4678                 pix_multiply_1x128 (
4679                     pix_multiply_1x128 (xmm_alpha,
4680                                        unpack_32_1x128 (m)),
4681                     unpack_32_1x128 (d)));
4682             w--;
4683         }
4684
4685         while (w >= 16)
4686         {
4687             xmm_mask = load_128_unaligned ((__m128i*)mask);
4688             xmm_dst = load_128_aligned ((__m128i*)dst);
4689
4690             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4691             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4692
4693             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4694                                 &xmm_mask_lo, &xmm_mask_hi,
4695                                 &xmm_mask_lo, &xmm_mask_hi);
4696
4697             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4698                                 &xmm_dst_lo, &xmm_dst_hi,
4699                                 &xmm_dst_lo, &xmm_dst_hi);
4700
4701             save_128_aligned (
4702                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4703
4704             mask += 16;
4705             dst += 16;
4706             w -= 16;
4707         }
4708
4709         while (w)
4710         {
4711             m = (uint32_t) *mask++;
4712             d = (uint32_t) *dst;
4713
4714             *dst++ = (uint8_t) pack_1x128_32 (
4715                 pix_multiply_1x128 (
4716                     pix_multiply_1x128 (
4717                         xmm_alpha, unpack_32_1x128 (m)),
4718                     unpack_32_1x128 (d)));
4719             w--;
4720         }
4721     }
4722
4723     _mm_empty ();
4724 }
4725
4726 /* -----------------------------------------------------------------------
4727  * composite_in_n_8
4728  */
4729
4730 static void
4731 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4732                        pixman_op_t              op,
4733                        pixman_image_t *         src_image,
4734                        pixman_image_t *         mask_image,
4735                        pixman_image_t *         dst_image,
4736                        int32_t                  src_x,
4737                        int32_t                  src_y,
4738                        int32_t                  mask_x,
4739                        int32_t                  mask_y,
4740                        int32_t                  dest_x,
4741                        int32_t                  dest_y,
4742                        int32_t                  width,
4743                        int32_t                  height)
4744 {
4745     uint8_t     *dst_line, *dst;
4746     int dst_stride;
4747     uint32_t d;
4748     uint32_t src;
4749     int32_t w;
4750
4751     __m128i xmm_alpha;
4752     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4753
4754     PIXMAN_IMAGE_GET_LINE (
4755         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4756
4757     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4758
4759     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4760
4761     src = src >> 24;
4762
4763     if (src == 0xff)
4764         return;
4765
4766     if (src == 0x00)
4767     {
4768         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4769                      8, dest_x, dest_y, width, height, src);
4770
4771         return;
4772     }
4773
4774     while (height--)
4775     {
4776         dst = dst_line;
4777         dst_line += dst_stride;
4778         w = width;
4779
4780         while (w && ((unsigned long)dst & 15))
4781         {
4782             d = (uint32_t) *dst;
4783
4784             *dst++ = (uint8_t) pack_1x128_32 (
4785                 pix_multiply_1x128 (
4786                     xmm_alpha,
4787                     unpack_32_1x128 (d)));
4788             w--;
4789         }
4790
4791         while (w >= 16)
4792         {
4793             xmm_dst = load_128_aligned ((__m128i*)dst);
4794
4795             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4796             
4797             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4798                                 &xmm_dst_lo, &xmm_dst_hi,
4799                                 &xmm_dst_lo, &xmm_dst_hi);
4800
4801             save_128_aligned (
4802                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4803
4804             dst += 16;
4805             w -= 16;
4806         }
4807
4808         while (w)
4809         {
4810             d = (uint32_t) *dst;
4811
4812             *dst++ = (uint8_t) pack_1x128_32 (
4813                 pix_multiply_1x128 (
4814                     xmm_alpha,
4815                     unpack_32_1x128 (d)));
4816             w--;
4817         }
4818     }
4819
4820     _mm_empty ();
4821 }
4822
4823 /* ---------------------------------------------------------------------------
4824  * composite_in_8_8
4825  */
4826
4827 static void
4828 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4829                        pixman_op_t              op,
4830                        pixman_image_t *         src_image,
4831                        pixman_image_t *         mask_image,
4832                        pixman_image_t *         dst_image,
4833                        int32_t                  src_x,
4834                        int32_t                  src_y,
4835                        int32_t                  mask_x,
4836                        int32_t                  mask_y,
4837                        int32_t                  dest_x,
4838                        int32_t                  dest_y,
4839                        int32_t                  width,
4840                        int32_t                  height)
4841 {
4842     uint8_t     *dst_line, *dst;
4843     uint8_t     *src_line, *src;
4844     int src_stride, dst_stride;
4845     int32_t w;
4846     uint32_t s, d;
4847
4848     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4849     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4850
4851     PIXMAN_IMAGE_GET_LINE (
4852         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4853     PIXMAN_IMAGE_GET_LINE (
4854         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4855
4856     while (height--)
4857     {
4858         dst = dst_line;
4859         dst_line += dst_stride;
4860         src = src_line;
4861         src_line += src_stride;
4862         w = width;
4863
4864         while (w && ((unsigned long)dst & 15))
4865         {
4866             s = (uint32_t) *src++;
4867             d = (uint32_t) *dst;
4868
4869             *dst++ = (uint8_t) pack_1x128_32 (
4870                 pix_multiply_1x128 (
4871                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4872             w--;
4873         }
4874
4875         while (w >= 16)
4876         {
4877             xmm_src = load_128_unaligned ((__m128i*)src);
4878             xmm_dst = load_128_aligned ((__m128i*)dst);
4879
4880             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4881             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4882
4883             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4884                                 &xmm_dst_lo, &xmm_dst_hi,
4885                                 &xmm_dst_lo, &xmm_dst_hi);
4886
4887             save_128_aligned (
4888                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4889
4890             src += 16;
4891             dst += 16;
4892             w -= 16;
4893         }
4894
4895         while (w)
4896         {
4897             s = (uint32_t) *src++;
4898             d = (uint32_t) *dst;
4899
4900             *dst++ = (uint8_t) pack_1x128_32 (
4901                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4902             w--;
4903         }
4904     }
4905
4906     _mm_empty ();
4907 }
4908
4909 /* -------------------------------------------------------------------------
4910  * composite_add_n_8_8
4911  */
4912
4913 static void
4914 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4915                           pixman_op_t              op,
4916                           pixman_image_t *         src_image,
4917                           pixman_image_t *         mask_image,
4918                           pixman_image_t *         dst_image,
4919                           int32_t                  src_x,
4920                           int32_t                  src_y,
4921                           int32_t                  mask_x,
4922                           int32_t                  mask_y,
4923                           int32_t                  dest_x,
4924                           int32_t                  dest_y,
4925                           int32_t                  width,
4926                           int32_t                  height)
4927 {
4928     uint8_t     *dst_line, *dst;
4929     uint8_t     *mask_line, *mask;
4930     int dst_stride, mask_stride;
4931     int32_t w;
4932     uint32_t src;
4933     uint8_t sa;
4934     uint32_t m, d;
4935
4936     __m128i xmm_alpha;
4937     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4938     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4939
4940     PIXMAN_IMAGE_GET_LINE (
4941         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4942     PIXMAN_IMAGE_GET_LINE (
4943         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4944
4945     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4946
4947     sa = src >> 24;
4948
4949     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4950
4951     while (height--)
4952     {
4953         dst = dst_line;
4954         dst_line += dst_stride;
4955         mask = mask_line;
4956         mask_line += mask_stride;
4957         w = width;
4958
4959         while (w && ((unsigned long)dst & 15))
4960         {
4961             m = (uint32_t) *mask++;
4962             d = (uint32_t) *dst;
4963
4964             *dst++ = (uint8_t) pack_1x128_32 (
4965                 _mm_adds_epu16 (
4966                     pix_multiply_1x128 (
4967                         xmm_alpha, unpack_32_1x128 (m)),
4968                     unpack_32_1x128 (d)));
4969             w--;
4970         }
4971
4972         while (w >= 16)
4973         {
4974             xmm_mask = load_128_unaligned ((__m128i*)mask);
4975             xmm_dst = load_128_aligned ((__m128i*)dst);
4976
4977             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4978             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4979
4980             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4981                                 &xmm_mask_lo, &xmm_mask_hi,
4982                                 &xmm_mask_lo, &xmm_mask_hi);
4983
4984             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4985             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4986
4987             save_128_aligned (
4988                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4989
4990             mask += 16;
4991             dst += 16;
4992             w -= 16;
4993         }
4994
4995         while (w)
4996         {
4997             m = (uint32_t) *mask++;
4998             d = (uint32_t) *dst;
4999
5000             *dst++ = (uint8_t) pack_1x128_32 (
5001                 _mm_adds_epu16 (
5002                     pix_multiply_1x128 (
5003                         xmm_alpha, unpack_32_1x128 (m)),
5004                     unpack_32_1x128 (d)));
5005
5006             w--;
5007         }
5008     }
5009
5010     _mm_empty ();
5011 }
5012
5013 /* -------------------------------------------------------------------------
5014  * composite_add_n_8
5015  */
5016
5017 static void
5018 sse2_composite_add_n_8 (pixman_implementation_t *imp,
5019                         pixman_op_t              op,
5020                         pixman_image_t *         src_image,
5021                         pixman_image_t *         mask_image,
5022                         pixman_image_t *         dst_image,
5023                         int32_t                  src_x,
5024                         int32_t                  src_y,
5025                         int32_t                  mask_x,
5026                         int32_t                  mask_y,
5027                         int32_t                  dest_x,
5028                         int32_t                  dest_y,
5029                         int32_t                  width,
5030                         int32_t                  height)
5031 {
5032     uint8_t     *dst_line, *dst;
5033     int dst_stride;
5034     int32_t w;
5035     uint32_t src;
5036
5037     __m128i xmm_src;
5038
5039     PIXMAN_IMAGE_GET_LINE (
5040         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5041
5042     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5043
5044     src >>= 24;
5045
5046     if (src == 0x00)
5047         return;
5048
5049     if (src == 0xff)
5050     {
5051         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
5052                      8, dest_x, dest_y, width, height, 0xff);
5053
5054         return;
5055     }
5056
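         /* Replicate the 8-bit alpha into every byte of a 32-bit word,
          * then broadcast it across the whole register,
          * e.g. 0x80 -> 0x80808080 in each of the four words.
          */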
5057     src = (src << 24) | (src << 16) | (src << 8) | src;
5058     xmm_src = _mm_set_epi32 (src, src, src, src);
5059
5060     while (height--)
5061     {
5062         dst = dst_line;
5063         dst_line += dst_stride;
5064         w = width;
5065
5066         while (w && ((unsigned long)dst & 15))
5067         {
5068             *dst = (uint8_t)_mm_cvtsi128_si32 (
5069                 _mm_adds_epu8 (
5070                     xmm_src,
5071                     _mm_cvtsi32_si128 (*dst)));
5072
5073             w--;
5074             dst++;
5075         }
5076
5077         while (w >= 16)
5078         {
5079             save_128_aligned (
5080                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
5081
5082             dst += 16;
5083             w -= 16;
5084         }
5085
5086         while (w)
5087         {
5088             *dst = (uint8_t)_mm_cvtsi128_si32 (
5089                 _mm_adds_epu8 (
5090                     xmm_src,
5091                     _mm_cvtsi32_si128 (*dst)));
5092
5093             w--;
5094             dst++;
5095         }
5096     }
5097
5098     _mm_empty ();
5099 }
5100
5101 /* ----------------------------------------------------------------------
5102  * composite_add_8_8
5103  */
5104
5105 static void
5106 sse2_composite_add_8_8 (pixman_implementation_t *imp,
5107                         pixman_op_t              op,
5108                         pixman_image_t *         src_image,
5109                         pixman_image_t *         mask_image,
5110                         pixman_image_t *         dst_image,
5111                         int32_t                  src_x,
5112                         int32_t                  src_y,
5113                         int32_t                  mask_x,
5114                         int32_t                  mask_y,
5115                         int32_t                  dest_x,
5116                         int32_t                  dest_y,
5117                         int32_t                  width,
5118                         int32_t                  height)
5119 {
5120     uint8_t     *dst_line, *dst;
5121     uint8_t     *src_line, *src;
5122     int dst_stride, src_stride;
5123     int32_t w;
5124     uint16_t t;
5125
5126     PIXMAN_IMAGE_GET_LINE (
5127         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5128     PIXMAN_IMAGE_GET_LINE (
5129         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5130
5131     while (height--)
5132     {
5133         dst = dst_line;
5134         src = src_line;
5135
5136         dst_line += dst_stride;
5137         src_line += src_stride;
5138         w = width;
5139
5140         /* Small head */
5141         while (w && (unsigned long)dst & 3)
5142         {
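            /* Branchless saturating add: t is a 9-bit sum, so t >> 8 is 1
             * exactly on overflow, 0 - (t >> 8) becomes 0xffff, and the
             * stored byte clamps to 0xff (e.g. 200 + 100 = 0x12c -> 0xff). */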
5143             t = (*dst) + (*src++);
5144             *dst++ = t | (0 - (t >> 8));
5145             w--;
5146         }
5147
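        /* The 4-byte-aligned middle of the scanline is combined 32 bits at
         * a time; the ADD combiner saturates per byte, so packing four a8
         * pixels into one uint32_t is safe. */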
5148         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5149
5150         /* Small tail */
5151         dst += w & ~3;
5152         src += w & ~3;
5153
5154         w &= 3;
5155
5156         while (w)
5157         {
5158             t = (*dst) + (*src++);
5159             *dst++ = t | (0 - (t >> 8));
5160             w--;
5161         }
5162     }
5163
5164     _mm_empty ();
5165 }
5166
5167 /* ---------------------------------------------------------------------
5168  * composite_add_8888_8888
5169  */
5170 static void
5171 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5172                               pixman_op_t              op,
5173                               pixman_image_t *         src_image,
5174                               pixman_image_t *         mask_image,
5175                               pixman_image_t *         dst_image,
5176                               int32_t                  src_x,
5177                               int32_t                  src_y,
5178                               int32_t                  mask_x,
5179                               int32_t                  mask_y,
5180                               int32_t                  dest_x,
5181                               int32_t                  dest_y,
5182                               int32_t                  width,
5183                               int32_t                  height)
5184 {
5185     uint32_t    *dst_line, *dst;
5186     uint32_t    *src_line, *src;
5187     int dst_stride, src_stride;
5188
5189     PIXMAN_IMAGE_GET_LINE (
5190         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5191     PIXMAN_IMAGE_GET_LINE (
5192         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5193
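    /* PIXMAN_OP_ADD on 8888 is a plain per-component saturating add, so
     * each scanline can be handed to the shared combiner directly. */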
5194     while (height--)
5195     {
5196         dst = dst_line;
5197         dst_line += dst_stride;
5198         src = src_line;
5199         src_line += src_stride;
5200
5201         core_combine_add_u_sse2 (dst, src, NULL, width);
5202     }
5203
5204     _mm_empty ();
5205 }
5206
5207 /* -------------------------------------------------------------------------
5208  * sse2_composite_copy_area
5209  */
5210
5211 static pixman_bool_t
5212 pixman_blt_sse2 (uint32_t *src_bits,
5213                  uint32_t *dst_bits,
5214                  int       src_stride,
5215                  int       dst_stride,
5216                  int       src_bpp,
5217                  int       dst_bpp,
5218                  int       src_x,
5219                  int       src_y,
5220                  int       dst_x,
5221                  int       dst_y,
5222                  int       width,
5223                  int       height)
5224 {
5225     uint8_t *   src_bytes;
5226     uint8_t *   dst_bytes;
5227     int byte_width;
5228
5229     if (src_bpp != dst_bpp)
5230         return FALSE;
5231
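    /* Strides arrive in units of uint32_t; convert them to units of the
     * pixel size for the address arithmetic, then to bytes for the loop. */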
5232     if (src_bpp == 16)
5233     {
5234         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5235         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5236         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5237         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5238         byte_width = 2 * width;
5239         src_stride *= 2;
5240         dst_stride *= 2;
5241     }
5242     else if (src_bpp == 32)
5243     {
5244         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5245         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5246         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5247         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5248         byte_width = 4 * width;
5249         src_stride *= 4;
5250         dst_stride *= 4;
5251     }
5252     else
5253     {
5254         return FALSE;
5255     }
5256
5257     while (height--)
5258     {
5259         int w;
5260         uint8_t *s = src_bytes;
5261         uint8_t *d = dst_bytes;
5262         src_bytes += src_stride;
5263         dst_bytes += dst_stride;
5264         w = byte_width;
5265
5266         while (w >= 2 && ((unsigned long)d & 3))
5267         {
5268             *(uint16_t *)d = *(uint16_t *)s;
5269             w -= 2;
5270             s += 2;
5271             d += 2;
5272         }
5273
5274         while (w >= 4 && ((unsigned long)d & 15))
5275         {
5276             *(uint32_t *)d = *(uint32_t *)s;
5277
5278             w -= 4;
5279             s += 4;
5280             d += 4;
5281         }
5282
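        /* Bulk copy: 64 bytes per iteration through four XMM registers,
         * unaligned loads and aligned stores. */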
5283         while (w >= 64)
5284         {
5285             __m128i xmm0, xmm1, xmm2, xmm3;
5286
5287             xmm0 = load_128_unaligned ((__m128i*)(s));
5288             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5289             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5290             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5291
5292             save_128_aligned ((__m128i*)(d),    xmm0);
5293             save_128_aligned ((__m128i*)(d + 16), xmm1);
5294             save_128_aligned ((__m128i*)(d + 32), xmm2);
5295             save_128_aligned ((__m128i*)(d + 48), xmm3);
5296
5297             s += 64;
5298             d += 64;
5299             w -= 64;
5300         }
5301
5302         while (w >= 16)
5303         {
5304             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5305
5306             w -= 16;
5307             d += 16;
5308             s += 16;
5309         }
5310
5311         while (w >= 4)
5312         {
5313             *(uint32_t *)d = *(uint32_t *)s;
5314
5315             w -= 4;
5316             s += 4;
5317             d += 4;
5318         }
5319
5320         if (w >= 2)
5321         {
5322             *(uint16_t *)d = *(uint16_t *)s;
5323             w -= 2;
5324             s += 2;
5325             d += 2;
5326         }
5327     }
5328
5329     _mm_empty ();
5330
5331     return TRUE;
5332 }
5333
5334 static void
5335 sse2_composite_copy_area (pixman_implementation_t *imp,
5336                           pixman_op_t              op,
5337                           pixman_image_t *         src_image,
5338                           pixman_image_t *         mask_image,
5339                           pixman_image_t *         dst_image,
5340                           int32_t                  src_x,
5341                           int32_t                  src_y,
5342                           int32_t                  mask_x,
5343                           int32_t                  mask_y,
5344                           int32_t                  dest_x,
5345                           int32_t                  dest_y,
5346                           int32_t                  width,
5347                           int32_t                  height)
5348 {
5349     pixman_blt_sse2 (src_image->bits.bits,
5350                      dst_image->bits.bits,
5351                      src_image->bits.rowstride,
5352                      dst_image->bits.rowstride,
5353                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5354                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5355                      src_x, src_y, dest_x, dest_y, width, height);
5356 }
5357
5358 static void
5359 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5360                                  pixman_op_t              op,
5361                                  pixman_image_t *         src_image,
5362                                  pixman_image_t *         mask_image,
5363                                  pixman_image_t *         dst_image,
5364                                  int32_t                  src_x,
5365                                  int32_t                  src_y,
5366                                  int32_t                  mask_x,
5367                                  int32_t                  mask_y,
5368                                  int32_t                  dest_x,
5369                                  int32_t                  dest_y,
5370                                  int32_t                  width,
5371                                  int32_t                  height)
5372 {
5373     uint32_t    *src, *src_line, s;
5374     uint32_t    *dst, *dst_line, d;
5375     uint8_t         *mask, *mask_line;
5376     uint32_t m;
5377     int src_stride, mask_stride, dst_stride;
5378     int32_t w;
5379     __m128i ms;
5380
5381     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5382     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5383     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5384
5385     PIXMAN_IMAGE_GET_LINE (
5386         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5387     PIXMAN_IMAGE_GET_LINE (
5388         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5389     PIXMAN_IMAGE_GET_LINE (
5390         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5391
5392     while (height--)
5393     {
5394         src = src_line;
5395         src_line += src_stride;
5396         dst = dst_line;
5397         dst_line += dst_stride;
5398         mask = mask_line;
5399         mask_line += mask_stride;
5400
5401         w = width;
5402
5403         while (w && (unsigned long)dst & 15)
5404         {
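            /* x888 sources carry undefined alpha; force it to opaque. */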
5405             s = 0xff000000 | *src++;
5406             m = (uint32_t) *mask++;
5407             d = *dst;
5408             ms = unpack_32_1x128 (s);
5409
5410             if (m != 0xff)
5411             {
5412                 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
5413                 __m128i md = unpack_32_1x128 (d);
5414
5415                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
5416             }
5417
5418             *dst++ = pack_1x128_32 (ms);
5419             w--;
5420         }
5421
5422         while (w >= 4)
5423         {
5424             m = *(uint32_t*) mask;
5425             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5426
5427             if (m == 0xffffffff)
5428             {
5429                 save_128_aligned ((__m128i*)dst, xmm_src);
5430             }
5431             else
5432             {
5433                 xmm_dst = load_128_aligned ((__m128i*)dst);
5434
5435                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5436
5437                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5438                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5439                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5440
5441                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5442
5443                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5444
5445                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5446             }
5447
5448             src += 4;
5449             dst += 4;
5450             mask += 4;
5451             w -= 4;
5452         }
5453
5454         while (w)
5455         {
5456             m = (uint32_t) *mask++;
5457
5458             if (m)
5459             {
5460                 s = 0xff000000 | *src;
5461
5462                 if (m == 0xff)
5463                 {
5464                     *dst = s;
5465                 }
5466                 else
5467                 {
5468                     __m128i ma, md, ms;
5469
5470                     d = *dst;
5471
5472                     ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
5473                     md = unpack_32_1x128 (d);
5474                     ms = unpack_32_1x128 (s);
5475
5476                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
5477                 }
5478
5479             }
5480
5481             src++;
5482             dst++;
5483             w--;
5484         }
5485     }
5486
5487     _mm_empty ();
5488 }
5489
5490 static void
5491 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5492                                  pixman_op_t              op,
5493                                  pixman_image_t *         src_image,
5494                                  pixman_image_t *         mask_image,
5495                                  pixman_image_t *         dst_image,
5496                                  int32_t                  src_x,
5497                                  int32_t                  src_y,
5498                                  int32_t                  mask_x,
5499                                  int32_t                  mask_y,
5500                                  int32_t                  dest_x,
5501                                  int32_t                  dest_y,
5502                                  int32_t                  width,
5503                                  int32_t                  height)
5504 {
5505     uint32_t    *src, *src_line, s;
5506     uint32_t    *dst, *dst_line, d;
5507     uint8_t         *mask, *mask_line;
5508     uint32_t m;
5509     int src_stride, mask_stride, dst_stride;
5510     int32_t w;
5511
5512     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5513     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5514     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5515
5516     PIXMAN_IMAGE_GET_LINE (
5517         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5518     PIXMAN_IMAGE_GET_LINE (
5519         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5520     PIXMAN_IMAGE_GET_LINE (
5521         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5522
5523     while (height--)
5524     {
5525         src = src_line;
5526         src_line += src_stride;
5527         dst = dst_line;
5528         dst_line += dst_stride;
5529         mask = mask_line;
5530         mask_line += mask_stride;
5531
5532         w = width;
5533
5534         while (w && (unsigned long)dst & 15)
5535         {
5536             uint32_t sa;
5537
5538             s = *src++;
5539             m = (uint32_t) *mask++;
5540             d = *dst;
5541
5542             sa = s >> 24;
5543
5544             if (m)
5545             {
5546                 if (sa == 0xff && m == 0xff)
5547                 {
5548                     *dst = s;
5549                 }
5550                 else
5551                 {
5552                     __m128i ms, md, ma, msa;
5553
5554                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5555                     ms = unpack_32_1x128 (s);
5556                     md = unpack_32_1x128 (d);
5557
5558                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5559
5560                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5561                 }
5562             }
5563
5564             dst++;
5565             w--;
5566         }
5567
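        /* Read four a8 mask values at once; m == 0xffffffff means all four
         * pixels are fully opaque. */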
5568         while (w >= 4)
5569         {
5570             m = *(uint32_t *) mask;
5571
5572             if (m)
5573             {
5574                 xmm_src = load_128_unaligned ((__m128i*)src);
5575
5576                 if (m == 0xffffffff && is_opaque (xmm_src))
5577                 {
5578                     save_128_aligned ((__m128i *)dst, xmm_src);
5579                 }
5580                 else
5581                 {
5582                     xmm_dst = load_128_aligned ((__m128i *)dst);
5583
5584                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5585
5586                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5587                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5588                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5589
5590                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5591                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5592
5593                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5594                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5595
5596                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5597                 }
5598             }
5599
5600             src += 4;
5601             dst += 4;
5602             mask += 4;
5603             w -= 4;
5604         }
5605
5606         while (w)
5607         {
5608             uint32_t sa;
5609
5610             s = *src++;
5611             m = (uint32_t) *mask++;
5612             d = *dst;
5613
5614             sa = s >> 24;
5615
5616             if (m)
5617             {
5618                 if (sa == 0xff && m == 0xff)
5619                 {
5620                     *dst = s;
5621                 }
5622                 else
5623                 {
5624                     __m128i ms, md, ma, msa;
5625
5626                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5627                     ms = unpack_32_1x128 (s);
5628                     md = unpack_32_1x128 (d);
5629
5630                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5631
5632                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5633                 }
5634             }
5635
5636             dst++;
5637             w--;
5638         }
5639     }
5640
5641     _mm_empty ();
5642 }
5643
5644 static void
5645 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5646                                     pixman_op_t              op,
5647                                     pixman_image_t *         src_image,
5648                                     pixman_image_t *         mask_image,
5649                                     pixman_image_t *         dst_image,
5650                                     int32_t                  src_x,
5651                                     int32_t                  src_y,
5652                                     int32_t                  mask_x,
5653                                     int32_t                  mask_y,
5654                                     int32_t                  dest_x,
5655                                     int32_t                  dest_y,
5656                                     int32_t                  width,
5657                                     int32_t                  height)
5658 {
5659     uint32_t src;
5660     uint32_t    *dst_line, *dst;
5661     __m128i xmm_src;
5662     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5663     __m128i xmm_dsta_hi, xmm_dsta_lo;
5664     int dst_stride;
5665     int32_t w;
5666
5667     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5668
5669     if (src == 0)
5670         return;
5671
5672     PIXMAN_IMAGE_GET_LINE (
5673         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5674
5675     xmm_src = expand_pixel_32_1x128 (src);
5676
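    /* OVER_REVERSE composites the destination over the solid source:
     * result = dst + (1 - dst.alpha) * src. */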
5677     while (height--)
5678     {
5679         dst = dst_line;
5680
5681         dst_line += dst_stride;
5682         w = width;
5683
5684         while (w && (unsigned long)dst & 15)
5685         {
5686             __m128i vd;
5687
5688             vd = unpack_32_1x128 (*dst);
5689
5690             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5691                                               xmm_src));
5692             w--;
5693             dst++;
5694         }
5695
5696         while (w >= 4)
5697         {
5698             __m128i tmp_lo, tmp_hi;
5699
5700             xmm_dst = load_128_aligned ((__m128i*)dst);
5701
5702             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5703             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5704
5705             tmp_lo = xmm_src;
5706             tmp_hi = xmm_src;
5707
5708             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5709                         &xmm_dsta_lo, &xmm_dsta_hi,
5710                         &tmp_lo, &tmp_hi);
5711
5712             save_128_aligned (
5713                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5714
5715             w -= 4;
5716             dst += 4;
5717         }
5718
5719         while (w)
5720         {
5721             __m128i vd;
5722
5723             vd = unpack_32_1x128 (*dst);
5724
5725             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5726                                               xmm_src));
5727             w--;
5728             dst++;
5729         }
5730
5732
5733     _mm_empty ();
5734 }
5735
5736 static void
5737 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5738                                     pixman_op_t              op,
5739                                     pixman_image_t *         src_image,
5740                                     pixman_image_t *         mask_image,
5741                                     pixman_image_t *         dst_image,
5742                                     int32_t                  src_x,
5743                                     int32_t                  src_y,
5744                                     int32_t                  mask_x,
5745                                     int32_t                  mask_y,
5746                                     int32_t                  dest_x,
5747                                     int32_t                  dest_y,
5748                                     int32_t                  width,
5749                                     int32_t                  height)
5750 {
5751     uint32_t    *src, *src_line, s;
5752     uint32_t    *dst, *dst_line, d;
5753     uint32_t    *mask, *mask_line;
5754     uint32_t    m;
5755     int src_stride, mask_stride, dst_stride;
5756     int32_t w;
5757
5758     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5759     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5760     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5761
5762     PIXMAN_IMAGE_GET_LINE (
5763         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5764     PIXMAN_IMAGE_GET_LINE (
5765         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5766     PIXMAN_IMAGE_GET_LINE (
5767         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5768
5769     while (height--)
5770     {
5771         src = src_line;
5772         src_line += src_stride;
5773         dst = dst_line;
5774         dst_line += dst_stride;
5775         mask = mask_line;
5776         mask_line += mask_stride;
5777
5778         w = width;
5779
5780         while (w && (unsigned long)dst & 15)
5781         {
5782             uint32_t sa;
5783
5784             s = *src++;
5785             m = (*mask++) >> 24;
5786             d = *dst;
5787
5788             sa = s >> 24;
5789
5790             if (m)
5791             {
5792                 if (sa == 0xff && m == 0xff)
5793                 {
5794                     *dst = s;
5795                 }
5796                 else
5797                 {
5798                     __m128i ms, md, ma, msa;
5799
5800                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5801                     ms = unpack_32_1x128 (s);
5802                     md = unpack_32_1x128 (d);
5803
5804                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5805
5806                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5807                 }
5808             }
5809
5810             dst++;
5811             w--;
5812         }
5813
5814         while (w >= 4)
5815         {
5816             xmm_mask = load_128_unaligned ((__m128i*)mask);
5817
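            /* Skip groups whose mask is fully transparent, and copy the
             * source straight through when both mask and source are fully
             * opaque. */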
5818             if (!is_transparent (xmm_mask))
5819             {
5820                 xmm_src = load_128_unaligned ((__m128i*)src);
5821
5822                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5823                 {
5824                     save_128_aligned ((__m128i *)dst, xmm_src);
5825                 }
5826                 else
5827                 {
5828                     xmm_dst = load_128_aligned ((__m128i *)dst);
5829
5830                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5831                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5832                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5833
5834                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5835                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5836
5837                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5838                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5839
5840                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5841                 }
5842             }
5843
5844             src += 4;
5845             dst += 4;
5846             mask += 4;
5847             w -= 4;
5848         }
5849
5850         while (w)
5851         {
5852             uint32_t sa;
5853
5854             s = *src++;
5855             m = (*mask++) >> 24;
5856             d = *dst;
5857
5858             sa = s >> 24;
5859
5860             if (m)
5861             {
5862                 if (sa == 0xff && m == 0xff)
5863                 {
5864                     *dst = s;
5865                 }
5866                 else
5867                 {
5868                     __m128i ms, md, ma, msa;
5869
5870                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5871                     ms = unpack_32_1x128 (s);
5872                     md = unpack_32_1x128 (d);
5873
5874                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5875
5876                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5877                 }
5878             }
5879
5880             dst++;
5881             w--;
5882         }
5883     }
5884
5885     _mm_empty ();
5886 }
5887
5888 /* A variant of 'core_combine_over_u_sse2' with minor tweaks */
5889 static force_inline void
5890 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5891                                              const uint32_t* ps,
5892                                              int32_t         w,
5893                                              pixman_fixed_t  vx,
5894                                              pixman_fixed_t  unit_x,
5895                                              pixman_fixed_t  max_vx,
5896                                              pixman_bool_t   fully_transparent_src)
5897 {
5898     uint32_t s, d;
5899     const uint32_t* pm = NULL;
5900
5901     __m128i xmm_dst_lo, xmm_dst_hi;
5902     __m128i xmm_src_lo, xmm_src_hi;
5903     __m128i xmm_alpha_lo, xmm_alpha_hi;
5904
5905     if (fully_transparent_src)
5906         return;
5907
5908     /* Align dst on a 16-byte boundary */
5909     while (w && ((unsigned long)pd & 15))
5910     {
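        /* vx is 16.16 fixed point; vx >> 16 selects the nearest source
         * pixel. */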
5911         d = *pd;
5912         s = combine1 (ps + (vx >> 16), pm);
5913         vx += unit_x;
5914
5915         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5916         if (pm)
5917             pm++;
5918         w--;
5919     }
5920
5921     while (w >= 4)
5922     {
5923         __m128i tmp;
5924         uint32_t tmp1, tmp2, tmp3, tmp4;
5925
5926         tmp1 = ps[vx >> 16];
5927         vx += unit_x;
5928         tmp2 = ps[vx >> 16];
5929         vx += unit_x;
5930         tmp3 = ps[vx >> 16];
5931         vx += unit_x;
5932         tmp4 = ps[vx >> 16];
5933         vx += unit_x;
5934
5935         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5936
5937         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5938
5939         if (is_opaque (xmm_src_hi))
5940         {
5941             save_128_aligned ((__m128i*)pd, xmm_src_hi);
5942         }
5943         else if (!is_zero (xmm_src_hi))
5944         {
5945             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5946
5947             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5948             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5949
5950             expand_alpha_2x128 (
5951                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5952
5953             over_2x128 (&xmm_src_lo, &xmm_src_hi,
5954                         &xmm_alpha_lo, &xmm_alpha_hi,
5955                         &xmm_dst_lo, &xmm_dst_hi);
5956
5957             /* rebuild the 4 pixel data and save */
5958             save_128_aligned ((__m128i*)pd,
5959                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5960         }
5961
5962         w -= 4;
5963         pd += 4;
5964         if (pm)
5965             pm += 4;
5966     }
5967
5968     while (w)
5969     {
5970         d = *pd;
5971         s = combine1 (ps + (vx >> 16), pm);
5972         vx += unit_x;
5973
5974         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5975         if (pm)
5976             pm++;
5977
5978         w--;
5979     }
5980     _mm_empty ();
5981 }
5982
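/* Instantiate the nearest-neighbour scaling main loops, one per repeat
 * mode, around the scanline function above. */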
5983 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5984                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5985                        uint32_t, uint32_t, COVER)
5986 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5987                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5988                        uint32_t, uint32_t, NONE)
5989 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5990                        scaled_nearest_scanline_sse2_8888_8888_OVER,
5991                        uint32_t, uint32_t, PAD)
5992
5993 static force_inline void
5994 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5995                                                uint32_t *       dst,
5996                                                const uint32_t * src,
5997                                                int32_t          w,
5998                                                pixman_fixed_t   vx,
5999                                                pixman_fixed_t   unit_x,
6000                                                pixman_fixed_t   max_vx,
6001                                                pixman_bool_t    zero_src)
6002 {
6003     __m128i xmm_mask;
6004     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
6005     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6006     __m128i xmm_alpha_lo, xmm_alpha_hi;
6007
6008     if (zero_src || (*mask >> 24) == 0)
6009         return;
6010
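    /* The mask is a solid color, so only its alpha is needed; replicate it
     * across all lanes once, up front. */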
6011     xmm_mask = create_mask_16_128 (*mask >> 24);
6012
6013     while (w && (unsigned long)dst & 15)
6014     {
6015         uint32_t s = src[pixman_fixed_to_int (vx)];
6016         vx += unit_x;
6017
6018         if (s)
6019         {
6020             uint32_t d = *dst;
6021
6022             __m128i ms = unpack_32_1x128 (s);
6023             __m128i alpha     = expand_alpha_1x128 (ms);
6024             __m128i dest      = xmm_mask;
6025             __m128i alpha_dst = unpack_32_1x128 (d);
6026
6027             *dst = pack_1x128_32 (
6028                 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6029         }
6030         dst++;
6031         w--;
6032     }
6033
6034     while (w >= 4)
6035     {
6036         uint32_t tmp1, tmp2, tmp3, tmp4;
6037
6038         tmp1 = src[pixman_fixed_to_int (vx)];
6039         vx += unit_x;
6040         tmp2 = src[pixman_fixed_to_int (vx)];
6041         vx += unit_x;
6042         tmp3 = src[pixman_fixed_to_int (vx)];
6043         vx += unit_x;
6044         tmp4 = src[pixman_fixed_to_int (vx)];
6045         vx += unit_x;
6046
6047         xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
6048
6049         if (!is_zero (xmm_src))
6050         {
6051             xmm_dst = load_128_aligned ((__m128i*)dst);
6052
6053             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6054             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6055             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6056                                 &xmm_alpha_lo, &xmm_alpha_hi);
6057
6058             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6059                            &xmm_alpha_lo, &xmm_alpha_hi,
6060                            &xmm_mask, &xmm_mask,
6061                            &xmm_dst_lo, &xmm_dst_hi);
6062
6063             save_128_aligned (
6064                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6065         }
6066
6067         dst += 4;
6068         w -= 4;
6069     }
6070
6071     while (w)
6072     {
6073         uint32_t s = src[pixman_fixed_to_int (vx)];
6074         vx += unit_x;
6075
6076         if (s)
6077         {
6078             uint32_t d = *dst;
6079
6080             __m128i ms = unpack_32_1x128 (s);
6081             __m128i alpha = expand_alpha_1x128 (ms);
6082             __m128i mask  = xmm_mask;
6083             __m128i dest  = unpack_32_1x128 (d);
6084
6085             *dst = pack_1x128_32 (
6086                 in_over_1x128 (&ms, &alpha, &mask, &dest));
6087         }
6088
6089         dst++;
6090         w--;
6091     }
6092
6093     _mm_empty ();
6094 }
6095
6096 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6097                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
6098                               uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
6099 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6100                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
6101                               uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
6102 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6103                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
6104                               uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
6105
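/* Each entry maps an (operator, source, mask, destination) combination to a
 * specialized routine. The table is searched in order (first match wins), so
 * more specific entries should come before more general ones. */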
6106 static const pixman_fast_path_t sse2_fast_paths[] =
6107 {
6108     /* PIXMAN_OP_OVER */
6109     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6110     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6111     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6112     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6113     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6114     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6115     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6116     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6117     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6118     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6119     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6120     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6121     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6122     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6123     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6124     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6125     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6126     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6127     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6128     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6129     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6130     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6131     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6132     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6133     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6134     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6135     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6136     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6137     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6138     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6139     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6140     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6141     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6142     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6143     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6144     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6145     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6146     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6147     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6148     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6149     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6150     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6151     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6152     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6153     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6154     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6155
6156     /* PIXMAN_OP_OVER_REVERSE */
6157     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6158     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6159
6160     /* PIXMAN_OP_ADD */
6161     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6162     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6163     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6164     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6165     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6166     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6167
6168     /* PIXMAN_OP_SRC */
6169     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6170     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6171     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6172     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6173     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6174     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6175     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6176     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6177     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6178     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6179     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6180     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6181     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6182     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6183
6184     /* PIXMAN_OP_IN */
6185     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6186     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6187     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6188
6189     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6190     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6191     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6192     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6193     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6194     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6195     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6196     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6197     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6198     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6199     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6200     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6201
6202     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6203     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6204     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6205     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6206
6207     { PIXMAN_OP_NONE },
6208 };
6209
6210 static pixman_bool_t
6211 sse2_blt (pixman_implementation_t *imp,
6212           uint32_t *               src_bits,
6213           uint32_t *               dst_bits,
6214           int                      src_stride,
6215           int                      dst_stride,
6216           int                      src_bpp,
6217           int                      dst_bpp,
6218           int                      src_x,
6219           int                      src_y,
6220           int                      dst_x,
6221           int                      dst_y,
6222           int                      width,
6223           int                      height)
6224 {
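    /* Try the SSE2 blitter first; unsupported depths fall through to the
     * delegate implementation. */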
6225     if (!pixman_blt_sse2 (
6226             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6227             src_x, src_y, dst_x, dst_y, width, height))
6229     {
6230         return _pixman_implementation_blt (
6231             imp->delegate,
6232             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6233             src_x, src_y, dst_x, dst_y, width, height);
6234     }
6235
6236     return TRUE;
6237 }
6238
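/* 32-bit x86 ABIs may only guarantee 4-byte stack alignment; realign on
 * entry so spilled __m128i locals don't fault. */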
6239 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6240 __attribute__((__force_align_arg_pointer__))
6241 #endif
6242 static pixman_bool_t
6243 sse2_fill (pixman_implementation_t *imp,
6244            uint32_t *               bits,
6245            int                      stride,
6246            int                      bpp,
6247            int                      x,
6248            int                      y,
6249            int                      width,
6250            int                      height,
6251            uint32_t xor)
6252 {
6253     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
6254     {
6255         return _pixman_implementation_fill (
6256             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
6257     }
6258
6259     return TRUE;
6260 }
6261
6262 static uint32_t *
6263 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6264 {
6265     int w = iter->width;
6266     __m128i ff000000 = mask_ff000000;
6267     uint32_t *dst = iter->buffer;
6268     uint32_t *src = (uint32_t *)iter->bits;
6269
6270     iter->bits += iter->stride;
6271
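    /* Fetching x8r8g8b8 as a8r8g8b8 just means forcing the alpha byte
     * to 0xff. */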
6272     while (w && ((unsigned long)dst) & 0x0f)
6273     {
6274         *dst++ = (*src++) | 0xff000000;
6275         w--;
6276     }
6277
6278     while (w >= 4)
6279     {
6280         save_128_aligned (
6281             (__m128i *)dst, _mm_or_si128 (
6282                 load_128_unaligned ((__m128i *)src), ff000000));
6283
6284         dst += 4;
6285         src += 4;
6286         w -= 4;
6287     }
6288
6289     while (w)
6290     {
6291         *dst++ = (*src++) | 0xff000000;
6292         w--;
6293     }
6294
6295     return iter->buffer;
6296 }
6297
6298 static uint32_t *
6299 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6300 {
6301     int w = iter->width;
6302     uint32_t *dst = iter->buffer;
6303     uint16_t *src = (uint16_t *)iter->bits;
6304     __m128i ff000000 = mask_ff000000;
6305
6306     iter->bits += iter->stride;
6307
6308     while (w && ((unsigned long)dst) & 0x0f)
6309     {
6310         uint16_t s = *src++;
6311
6312         *dst++ = CONVERT_0565_TO_8888 (s);
6313         w--;
6314     }
6315
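    /* Convert eight r5g6b5 pixels per iteration: widen each 16-bit pixel to
     * its own 32-bit lane, expand the channels, then set alpha to 0xff. */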
6316     while (w >= 8)
6317     {
6318         __m128i lo, hi, s;
6319
6320         s = _mm_loadu_si128 ((__m128i *)src);
6321
6322         lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6323         hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6324
6325         save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6326         save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6327
6328         dst += 8;
6329         src += 8;
6330         w -= 8;
6331     }
6332
6333     while (w)
6334     {
6335         uint16_t s = *src++;
6336
6337         *dst++ = CONVERT_0565_TO_8888 (s);
6338         w--;
6339     }
6340
6341     return iter->buffer;
6342 }
6343
6344 static uint32_t *
6345 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6346 {
6347     int w = iter->width;
6348     uint32_t *dst = iter->buffer;
6349     uint8_t *src = iter->bits;
6350     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6351
6352     iter->bits += iter->stride;
6353
6354     while (w && (((unsigned long)dst) & 15))
6355     {
6356         *dst++ = *(src++) << 24;
6357         w--;
6358     }
6359
6360     while (w >= 16)
6361     {
6362         xmm0 = _mm_loadu_si128((__m128i *)src);
6363
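        /* Unpacking with zeros in the low bytes shifts each a8 value into
         * the top byte of a 32-bit lane, i.e. computes a << 24 for sixteen
         * pixels at once. */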
6364         xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
6365         xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
6366         xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6367         xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6368         xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6369         xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6370
6371         _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
6372         _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
6373         _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
6374         _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6375
6376         dst += 16;
6377         src += 16;
6378         w -= 16;
6379     }
6380
6381     while (w)
6382     {
6383         *dst++ = *(src++) << 24;
6384         w--;
6385     }
6386
6387     return iter->buffer;
6388 }
6389
6390 typedef struct
6391 {
6392     pixman_format_code_t        format;
6393     pixman_iter_get_scanline_t  get_scanline;
6394 } fetcher_info_t;
6395
6396 static const fetcher_info_t fetchers[] =
6397 {
6398     { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
6399     { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
6400     { PIXMAN_a8,                sse2_fetch_a8 },
6401     { PIXMAN_null }
6402 };
6403
6404 static void
6405 sse2_src_iter_init (pixman_implementation_t *imp,
6406                     pixman_iter_t *iter,
6407                     pixman_image_t *image,
6408                     int x, int y, int width, int height,
6409                     uint8_t *buffer, iter_flags_t flags)
6410 {
6411 #define FLAGS                                                           \
6412     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
6413
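    /* The SSE2 fetchers handle only narrow (8-bit per channel) iterations
     * over untransformed images whose requested region lies entirely inside
     * the bits; everything else is delegated. */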
6414     if ((flags & ITER_NARROW)                           &&
6415         (image->common.flags & FLAGS) == FLAGS          &&
6416         x >= 0 && y >= 0                                &&
6417         x + width <= image->bits.width                  &&
6418         y + height <= image->bits.height)
6419     {
6420         const fetcher_info_t *f;
6421
6422         for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
6423         {
6424             if (image->common.extended_format_code == f->format)
6425             {
6426                 uint8_t *b = (uint8_t *)image->bits.bits;
6427                 int s = image->bits.rowstride * 4;
6428
6429                 iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
6430                 iter->stride = s;
6431                 iter->width = width;
6432                 iter->buffer = (uint32_t *)buffer;
6433
6434                 iter->get_scanline = f->get_scanline;
6435                 return;
6436             }
6437         }
6438     }
6439
6440     _pixman_implementation_src_iter_init (
6441         imp->delegate, iter, image, x, y, width, height, buffer, flags);
6442 }
6443
6444 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6445 __attribute__((__force_align_arg_pointer__))
6446 #endif
6447 pixman_implementation_t *
6448 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6449 {
6450     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6451
6452     /* SSE2 constants */
6453     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6454     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6455     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6456     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6457     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6458     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6459     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6460     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6461     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
6462     mask_0080 = create_mask_16_128 (0x0080);
6463     mask_00ff = create_mask_16_128 (0x00ff);
6464     mask_0101 = create_mask_16_128 (0x0101);
6465     mask_ffff = create_mask_16_128 (0xffff);
6466     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6467     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6468
6469     /* MMX constants */
6470     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
6471     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
6472
6473     mask_x0080 = create_mask_16_64 (0x0080);
6474     mask_x00ff = create_mask_16_64 (0x00ff);
6475     mask_x0101 = create_mask_16_64 (0x0101);
6476     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
6477
6478     _mm_empty ();
6479
6480     /* Set up function pointers */
6481
6482     /* SSE2 combiners, used by the general compositing code in fbcompose.c */
6483     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6484     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6485     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6486     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6487     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6488     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6489     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6490     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6491     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6492     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6493
6494     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6495
6496     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6497     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6498     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6499     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6500     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6501     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6502     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6503     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6504     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6505     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6506     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6507
6508     imp->blt = sse2_blt;
6509     imp->fill = sse2_fill;
6510
6511     imp->src_iter_init = sse2_src_iter_init;
6512
6513     return imp;
6514 }
6515
6516 #endif /* USE_SSE2 */