sse2: Remove all the core_combine_* functions
pixman/pixman-sse2.c
/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
#   include "pixman-x64-mmx-emulation.h"
#endif

#ifdef USE_SSE2

/* --------------------------------------------------------------------
 * Locals
 */

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;
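
/* These constants are filled in when the SSE2 implementation is
 * created (see _pixman_implementation_create_sse2 ()) and are
 * read-only afterwards.
 */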

/* ----------------------------------------------------------------------
 * SSE2 Inlines
 */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
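
/* A scalar sketch of the 565 -> 8888 expansion above (illustrative
 * only, not part of pixman): each channel is widened by shifting left
 * and replicating its top bits into the vacated low bits, e.g. for
 * the 5-bit red channel
 *
 *     r8 = (r5 << 3) | (r5 >> 2);
 *
 * so 0x1f maps to 0xff and 0 maps to 0.  mask_565_fix_rb and
 * mask_565_fix_g select the bits to replicate for the 5-bit and
 * 6-bit channels respectively.
 */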

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
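
/* _mm_movemask_epi8 () packs the top bit of each of the 16 bytes into
 * a 16-bit mask.  Alpha is byte 3 of each 32-bit ARGB pixel, so bits
 * 3, 7, 11 and 15 -- the 0x8888 pattern above -- are the ones that
 * matter when only the alpha channels are being tested.
 */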

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
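
/* pix_multiply_* computes the correctly rounded x * a / 255 for each
 * 8-bit channel (held in a 16-bit lane).  A scalar sketch of the
 * trick, illustrative only and not part of pixman:
 *
 *     uint16_t t = (uint16_t)x * a + 0x80;
 *     uint8_t  r = (t + (t >> 8)) >> 8;
 *
 * which equals (t * 0x0101) >> 16, i.e. exactly what
 * _mm_mulhi_epu16 (t, mask_0101) returns.  For example x = a = 0xff
 * gives t = 0xfe81 and r = 0xff.
 */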

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}
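
/* pix_add_multiply_* evaluates
 *
 *     ret = src * alpha_dst + dst * alpha_src
 *
 * with unsigned saturation.  It is the workhorse of the ATOP and XOR
 * combiners below, which differ only in which of the two alphas are
 * negated before the call.
 */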

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
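
/* over_2x128 () implements the premultiplied OVER operator,
 *
 *     dst = src + (1 - alpha_src) * dst
 *
 * where the multiplication is the rounded division by 255 from
 * pix_multiply_2x128 () and the final addition saturates.
 */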

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
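
/* in_over_2x128 () is the core of the masked operations:
 *
 *     dst = (src IN mask) OVER dst
 *         = src * mask + dst * (1 - alpha_src * mask)
 *
 * i.e. both the source and its alpha are first multiplied by the
 * per-channel mask.
 */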

/* load 4 pixels from a 16-byte-aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels to a 16-byte-aligned address using a non-temporal
 * (write-combining) store
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels to a 16-byte-aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels to an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

/* ------------------------------------------------------------------
 * MMX inlines
 */

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

/* ----------------------------------------------------------------------------
 * Compose Core transformations
 */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}
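
/* Fast paths: a fully opaque source replaces the destination outright
 * and a zero source leaves it untouched, so the unpack/over/pack
 * round trip only runs for genuinely translucent pixels.
 */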

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
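
/* combine1 () and combine4 () apply the optional mask to one and four
 * source pixels respectively: the mask's alpha is broadcast to all
 * four channels and multiplied into the source, which is what the
 * unified-alpha (_u) combiners expect.  combine4 () also
 * short-circuits a fully transparent mask to zero.
 *
 * The combiners below all share one shape: a scalar head loop runs
 * until the destination is 16-byte aligned, a vector loop then
 * handles four pixels per iteration with aligned stores, and a
 * scalar tail loop finishes the remainder.
 */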

static force_inline void
core_combine_over_u_sse2_mask (uint32_t *         pd,
                               const uint32_t*    ps,
                               const uint32_t*    pm,
                               int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t*    ps,
                                  int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}
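
/* Keeping the mask and no-mask cases in separate force_inline helpers
 * lets the compiler drop the per-pixel "if (pm)" tests from the hot
 * loops; sse2_combine_over_u () merely dispatches between them.
 */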

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}
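
/* OVER_REVERSE is the same operator with the operands swapped,
 * dst = dst + (1 - alpha_dst) * src, hence the reuse of
 * core_combine_over_u_pixel_sse2 () with d and s exchanged.
 */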

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}
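
/* IN multiplies one operand by the other operand's alpha: the helper
 * above returns dst * alpha_src, so the IN and IN_REVERSE combiners
 * below simply pass their arguments in the order the operator needs.
 */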

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}
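
/* OUT_REVERSE keeps the part of the destination not covered by the
 * source, dst = dst * (1 - alpha_src); OUT below is the mirror image,
 * src * (1 - alpha_dst).
 */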

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
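
/* ATOP: dst = src * alpha_dst + dst * (1 - alpha_src); the source is
 * kept only where the destination is, and the destination shows
 * through where the source is transparent.  ATOP_REVERSE below swaps
 * which of the two alphas is negated.
 */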

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}
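
/* XOR: dst = src * (1 - alpha_dst) + dst * (1 - alpha_src); each
 * operand survives only where the other one is transparent.
 */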

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}
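
/* ADD is a plain saturating byte-wise addition, so the vector loop
 * needs no unpacking at all: _mm_adds_epu8 () does the whole job.
 */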

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}
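
/* SATURATE scales the source down just enough that adding it cannot
 * overflow the destination: when alpha_src > 1 - alpha_dst, the
 * source is first multiplied by (1 - alpha_dst) / alpha_src.
 */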

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned  ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if any src alpha is greater than the corresponding ~dst alpha */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}
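
/* The _ca ("component alpha") combiners below take a full ARGB mask
 * per pixel rather than a single alpha value, so each channel of the
 * source is multiplied by the matching channel of the mask.
 */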

static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }
}

static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i expAlpha = expand_alpha_1x128 (s);
    __m128i unpk_mask = unpack_32_1x128 (mask);
    __m128i unpk_dst  = unpack_32_1x128 (dst);

    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i d = unpack_32_1x128 (dst);

    return pack_1x128_32 (
        over_1x128 (d, expand_alpha_1x128 (d),
                    pix_multiply_1x128 (unpack_32_1x128 (src),
                                        unpack_32_1x128 (mask))));
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }
}
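
/* IN_ca computes dst = (src * mask) * alpha_dst; IN_REVERSE_ca below
 * flips it to dst = dst * (mask * alpha_src).
 */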
1789
1790 static void
1791 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1792                             pixman_op_t              op,
1793                             uint32_t *               pd,
1794                             const uint32_t *         ps,
1795                             const uint32_t *         pm,
1796                             int                      w)
1797 {
1798     uint32_t s, m, d;
1799
1800     __m128i xmm_alpha_lo, xmm_alpha_hi;
1801     __m128i xmm_src_lo, xmm_src_hi;
1802     __m128i xmm_dst_lo, xmm_dst_hi;
1803     __m128i xmm_mask_lo, xmm_mask_hi;
1804
1805     while (w && (unsigned long)pd & 15)
1806     {
1807         s = *ps++;
1808         m = *pm++;
1809         d = *pd;
1810
1811         *pd++ = pack_1x128_32 (
1812             pix_multiply_1x128 (
1813                 unpack_32_1x128 (d),
1814                 pix_multiply_1x128 (unpack_32_1x128 (m),
1815                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1816         w--;
1817     }
1818
1819     while (w >= 4)
1820     {
1821         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1822         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1823         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1824
1825         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1826         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1827         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1828
1829         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1830                             &xmm_alpha_lo, &xmm_alpha_hi);
1831         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1832                             &xmm_alpha_lo, &xmm_alpha_hi,
1833                             &xmm_alpha_lo, &xmm_alpha_hi);
1834
1835         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1836                             &xmm_alpha_lo, &xmm_alpha_hi,
1837                             &xmm_dst_lo, &xmm_dst_hi);
1838
1839         save_128_aligned (
1840             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1841
1842         ps += 4;
1843         pd += 4;
1844         pm += 4;
1845         w -= 4;
1846     }
1847
1848     while (w)
1849     {
1850         s = *ps++;
1851         m = *pm++;
1852         d = *pd;
1853
1854         *pd++ = pack_1x128_32 (
1855             pix_multiply_1x128 (
1856                 unpack_32_1x128 (d),
1857                 pix_multiply_1x128 (unpack_32_1x128 (m),
1858                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1859         w--;
1860     }
1861 }
1862
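/* OUT is IN with the destination alpha complemented, per channel:
 *
 *     dst.c = src.c * mask.c * (1 - dst.a)
 */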
1863 static void
1864 sse2_combine_out_ca (pixman_implementation_t *imp,
1865                      pixman_op_t              op,
1866                      uint32_t *               pd,
1867                      const uint32_t *         ps,
1868                      const uint32_t *         pm,
1869                      int                      w)
1870 {
1871     uint32_t s, m, d;
1872
1873     __m128i xmm_alpha_lo, xmm_alpha_hi;
1874     __m128i xmm_src_lo, xmm_src_hi;
1875     __m128i xmm_dst_lo, xmm_dst_hi;
1876     __m128i xmm_mask_lo, xmm_mask_hi;
1877
1878     while (w && (unsigned long)pd & 15)
1879     {
1880         s = *ps++;
1881         m = *pm++;
1882         d = *pd;
1883
1884         *pd++ = pack_1x128_32 (
1885             pix_multiply_1x128 (
1886                 pix_multiply_1x128 (
1887                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1888                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1889         w--;
1890     }
1891
1892     while (w >= 4)
1893     {
1894         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1895         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1896         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1897
1898         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1899         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1900         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1901
1902         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1903                             &xmm_alpha_lo, &xmm_alpha_hi);
1904         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1905                       &xmm_alpha_lo, &xmm_alpha_hi);
1906
1907         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1908                             &xmm_mask_lo, &xmm_mask_hi,
1909                             &xmm_dst_lo, &xmm_dst_hi);
1910         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1911                             &xmm_alpha_lo, &xmm_alpha_hi,
1912                             &xmm_dst_lo, &xmm_dst_hi);
1913
1914         save_128_aligned (
1915             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1916
1917         ps += 4;
1918         pd += 4;
1919         pm += 4;
1920         w -= 4;
1921     }
1922
1923     while (w)
1924     {
1925         s = *ps++;
1926         m = *pm++;
1927         d = *pd;
1928
1929         *pd++ = pack_1x128_32 (
1930             pix_multiply_1x128 (
1931                 pix_multiply_1x128 (
1932                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
1933                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1934
1935         w--;
1936     }
1937 }
1938
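/* The reverse OUT combiner keeps the destination where the mask-weighted
 * source alpha is transparent, per channel:
 *
 *     dst.c = dst.c * (1 - mask.c * src.a)
 */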
1939 static void
1940 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1941                              pixman_op_t              op,
1942                              uint32_t *               pd,
1943                              const uint32_t *         ps,
1944                              const uint32_t *         pm,
1945                              int                      w)
1946 {
1947     uint32_t s, m, d;
1948
1949     __m128i xmm_alpha_lo, xmm_alpha_hi;
1950     __m128i xmm_src_lo, xmm_src_hi;
1951     __m128i xmm_dst_lo, xmm_dst_hi;
1952     __m128i xmm_mask_lo, xmm_mask_hi;
1953
1954     while (w && (unsigned long)pd & 15)
1955     {
1956         s = *ps++;
1957         m = *pm++;
1958         d = *pd;
1959
1960         *pd++ = pack_1x128_32 (
1961             pix_multiply_1x128 (
1962                 unpack_32_1x128 (d),
1963                 negate_1x128 (pix_multiply_1x128 (
1964                                  unpack_32_1x128 (m),
1965                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
1966         w--;
1967     }
1968
1969     while (w >= 4)
1970     {
1971         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1972         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1973         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1974
1975         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1976         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1977         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1978
1979         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1980                             &xmm_alpha_lo, &xmm_alpha_hi);
1981
1982         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1983                             &xmm_alpha_lo, &xmm_alpha_hi,
1984                             &xmm_mask_lo, &xmm_mask_hi);
1985
1986         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1987                       &xmm_mask_lo, &xmm_mask_hi);
1988
1989         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1990                             &xmm_mask_lo, &xmm_mask_hi,
1991                             &xmm_dst_lo, &xmm_dst_hi);
1992
1993         save_128_aligned (
1994             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1995
1996         ps += 4;
1997         pd += 4;
1998         pm += 4;
1999         w -= 4;
2000     }
2001
2002     while (w)
2003     {
2004         s = *ps++;
2005         m = *pm++;
2006         d = *pd;
2007
2008         *pd++ = pack_1x128_32 (
2009             pix_multiply_1x128 (
2010                 unpack_32_1x128 (d),
2011                 negate_1x128 (pix_multiply_1x128 (
2012                                  unpack_32_1x128 (m),
2013                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
2014         w--;
2015     }
2016 }
2017
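/* ATOP blends the masked source onto the destination while preserving
 * the destination alpha, per channel:
 *
 *     dst.c = src.c * mask.c * dst.a + dst.c * (1 - mask.c * src.a)
 *
 * pix_add_multiply_1x128 below evaluates both products and the
 * saturated sum in one pass.
 */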
2018 static force_inline uint32_t
2019 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2020                                  uint32_t mask,
2021                                  uint32_t dst)
2022 {
2023     __m128i m = unpack_32_1x128 (mask);
2024     __m128i s = unpack_32_1x128 (src);
2025     __m128i d = unpack_32_1x128 (dst);
2026     __m128i sa = expand_alpha_1x128 (s);
2027     __m128i da = expand_alpha_1x128 (d);
2028
2029     s = pix_multiply_1x128 (s, m);
2030     m = negate_1x128 (pix_multiply_1x128 (m, sa));
2031
2032     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2033 }
2034
2035 static void
2036 sse2_combine_atop_ca (pixman_implementation_t *imp,
2037                       pixman_op_t              op,
2038                       uint32_t *               pd,
2039                       const uint32_t *         ps,
2040                       const uint32_t *         pm,
2041                       int                      w)
2042 {
2043     uint32_t s, m, d;
2044
2045     __m128i xmm_src_lo, xmm_src_hi;
2046     __m128i xmm_dst_lo, xmm_dst_hi;
2047     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2048     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2049     __m128i xmm_mask_lo, xmm_mask_hi;
2050
2051     while (w && (unsigned long)pd & 15)
2052     {
2053         s = *ps++;
2054         m = *pm++;
2055         d = *pd;
2056
2057         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2058         w--;
2059     }
2060
2061     while (w >= 4)
2062     {
2063         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2064         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2065         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2066
2067         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2068         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2069         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2070
2071         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2072                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2073         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2074                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2075
2076         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2077                             &xmm_mask_lo, &xmm_mask_hi,
2078                             &xmm_src_lo, &xmm_src_hi);
2079         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2080                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2081                             &xmm_mask_lo, &xmm_mask_hi);
2082
2083         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2084
2085         pix_add_multiply_2x128 (
2086             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2087             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2088             &xmm_dst_lo, &xmm_dst_hi);
2089
2090         save_128_aligned (
2091             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2092
2093         ps += 4;
2094         pd += 4;
2095         pm += 4;
2096         w -= 4;
2097     }
2098
2099     while (w)
2100     {
2101         s = *ps++;
2102         m = *pm++;
2103         d = *pd;
2104
2105         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2106         w--;
2107     }
2108 }
2109
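/* Reverse ATOP swaps the roles of the two alphas, per channel:
 *
 *     dst.c = src.c * mask.c * (1 - dst.a) + dst.c * (mask.c * src.a)
 */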
2110 static force_inline uint32_t
2111 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2112                                          uint32_t mask,
2113                                          uint32_t dst)
2114 {
2115     __m128i m = unpack_32_1x128 (mask);
2116     __m128i s = unpack_32_1x128 (src);
2117     __m128i d = unpack_32_1x128 (dst);
2118
2119     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2120     __m128i sa = expand_alpha_1x128 (s);
2121
2122     s = pix_multiply_1x128 (s, m);
2123     m = pix_multiply_1x128 (m, sa);
2124
2125     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2126 }
2127
2128 static void
2129 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2130                               pixman_op_t              op,
2131                               uint32_t *               pd,
2132                               const uint32_t *         ps,
2133                               const uint32_t *         pm,
2134                               int                      w)
2135 {
2136     uint32_t s, m, d;
2137
2138     __m128i xmm_src_lo, xmm_src_hi;
2139     __m128i xmm_dst_lo, xmm_dst_hi;
2140     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2141     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2142     __m128i xmm_mask_lo, xmm_mask_hi;
2143
2144     while (w && (unsigned long)pd & 15)
2145     {
2146         s = *ps++;
2147         m = *pm++;
2148         d = *pd;
2149
2150         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2151         w--;
2152     }
2153
2154     while (w >= 4)
2155     {
2156         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2157         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2158         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2159
2160         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2161         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2162         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2163
2164         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2165                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2166         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2167                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2168
2169         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2170                             &xmm_mask_lo, &xmm_mask_hi,
2171                             &xmm_src_lo, &xmm_src_hi);
2172         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2173                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2174                             &xmm_mask_lo, &xmm_mask_hi);
2175
2176         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2177                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2178
2179         pix_add_multiply_2x128 (
2180             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2181             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2182             &xmm_dst_lo, &xmm_dst_hi);
2183
2184         save_128_aligned (
2185             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2186
2187         ps += 4;
2188         pd += 4;
2189         pm += 4;
2190         w -= 4;
2191     }
2192
2193     while (w)
2194     {
2195         s = *ps++;
2196         m = *pm++;
2197         d = *pd;
2198
2199         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2200         w--;
2201     }
2202 }
2203
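/* XOR keeps each operand only where the other is transparent, per
 * channel:
 *
 *     dst.c = src.c * mask.c * (1 - dst.a) + dst.c * (1 - mask.c * src.a)
 */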
2204 static force_inline uint32_t
2205 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2206                                 uint32_t mask,
2207                                 uint32_t dst)
2208 {
2209     __m128i a = unpack_32_1x128 (mask);
2210     __m128i s = unpack_32_1x128 (src);
2211     __m128i d = unpack_32_1x128 (dst);
2212
2213     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2214                                        a, expand_alpha_1x128 (s)));
2215     __m128i dest      = pix_multiply_1x128 (s, a);
2216     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2217
2218     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2219                                                 &alpha_dst,
2220                                                 &dest,
2221                                                 &alpha_src));
2222 }
2223
2224 static void
2225 sse2_combine_xor_ca (pixman_implementation_t *imp,
2226                      pixman_op_t              op,
2227                      uint32_t *               pd,
2228                      const uint32_t *         ps,
2229                      const uint32_t *         pm,
2230                      int                      w)
2231 {
2232     uint32_t s, m, d;
2233
2234     __m128i xmm_src_lo, xmm_src_hi;
2235     __m128i xmm_dst_lo, xmm_dst_hi;
2236     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2237     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2238     __m128i xmm_mask_lo, xmm_mask_hi;
2239
2240     while (w && (unsigned long)pd & 15)
2241     {
2242         s = *ps++;
2243         m = *pm++;
2244         d = *pd;
2245
2246         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2247         w--;
2248     }
2249
2250     while (w >= 4)
2251     {
2252         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2253         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2254         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2255
2256         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2257         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2258         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2259
2260         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2261                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2262         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2263                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2264
2265         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2266                             &xmm_mask_lo, &xmm_mask_hi,
2267                             &xmm_src_lo, &xmm_src_hi);
2268         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2269                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2270                             &xmm_mask_lo, &xmm_mask_hi);
2271
2272         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2273                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2274         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2275                       &xmm_mask_lo, &xmm_mask_hi);
2276
2277         pix_add_multiply_2x128 (
2278             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2279             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2280             &xmm_dst_lo, &xmm_dst_hi);
2281
2282         save_128_aligned (
2283             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2284
2285         ps += 4;
2286         pd += 4;
2287         pm += 4;
2288         w -= 4;
2289     }
2290
2291     while (w)
2292     {
2293         s = *ps++;
2294         m = *pm++;
2295         d = *pd;
2296
2297         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2298         w--;
2299     }
2300 }
2301
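/* ADD multiplies the source by the mask and adds it to the destination
 * with unsigned saturation, per channel:
 *
 *     dst.c = MIN (src.c * mask.c + dst.c, 255)
 *
 * _mm_adds_epu8 provides the clamp directly, so no pix_add_multiply
 * step is needed.
 */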
2302 static void
2303 sse2_combine_add_ca (pixman_implementation_t *imp,
2304                      pixman_op_t              op,
2305                      uint32_t *               pd,
2306                      const uint32_t *         ps,
2307                      const uint32_t *         pm,
2308                      int                      w)
2309 {
2310     uint32_t s, m, d;
2311
2312     __m128i xmm_src_lo, xmm_src_hi;
2313     __m128i xmm_dst_lo, xmm_dst_hi;
2314     __m128i xmm_mask_lo, xmm_mask_hi;
2315
2316     while (w && (unsigned long)pd & 15)
2317     {
2318         s = *ps++;
2319         m = *pm++;
2320         d = *pd;
2321
2322         *pd++ = pack_1x128_32 (
2323             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2324                                                unpack_32_1x128 (m)),
2325                            unpack_32_1x128 (d)));
2326         w--;
2327     }
2328
2329     while (w >= 4)
2330     {
2331         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2332         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2333         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2334
2335         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2336         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2337         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2338
2339         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2340                             &xmm_mask_lo, &xmm_mask_hi,
2341                             &xmm_src_lo, &xmm_src_hi);
2342
2343         save_128_aligned (
2344             (__m128i*)pd, pack_2x128_128 (
2345                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2346                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2347
2348         ps += 4;
2349         pd += 4;
2350         pm += 4;
2351         w -= 4;
2352     }
2353
2354     while (w)
2355     {
2356         s = *ps++;
2357         m = *pm++;
2358         d = *pd;
2359
2360         *pd++ = pack_1x128_32 (
2361             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2362                                                unpack_32_1x128 (m)),
2363                            unpack_32_1x128 (d)));
2364         w--;
2365     }
2366 }
2367
2368 /* ---------------------------------------------------
2369  * fb_compose_setup_sse2
2370  */
2371 static force_inline __m128i
2372 create_mask_16_128 (uint16_t mask)
2373 {
2374     return _mm_set1_epi16 (mask);
2375 }
2376
2377 /* Work around a code generation bug in Sun Studio 12. */
2378 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2379 # define create_mask_2x32_128(mask0, mask1)                             \
2380     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2381 #else
2382 static force_inline __m128i
2383 create_mask_2x32_128 (uint32_t mask0,
2384                       uint32_t mask1)
2385 {
2386     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2387 }
2388 #endif
2389
2390 /* -------------------------------------------------------------------
2391  * composite_over_n_8888
2392  */
2393
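/* With a solid source, both the source and its expanded alpha are loop
 * invariants, so each pixel reduces to a plain OVER:
 *
 *     dst = src + dst * (1 - src.a)
 *
 * computed channel-wise by over_1x128 / over_2x128.
 */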
2394 static void
2395 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2396                             pixman_op_t              op,
2397                             pixman_image_t *         src_image,
2398                             pixman_image_t *         mask_image,
2399                             pixman_image_t *         dst_image,
2400                             int32_t                  src_x,
2401                             int32_t                  src_y,
2402                             int32_t                  mask_x,
2403                             int32_t                  mask_y,
2404                             int32_t                  dest_x,
2405                             int32_t                  dest_y,
2406                             int32_t                  width,
2407                             int32_t                  height)
2408 {
2409     uint32_t src;
2410     uint32_t    *dst_line, *dst, d;
2411     int32_t w;
2412     int dst_stride;
2413     __m128i xmm_src, xmm_alpha;
2414     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2415
2416     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2417
2418     if (src == 0)
2419         return;
2420
2421     PIXMAN_IMAGE_GET_LINE (
2422         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2423
2424     xmm_src = expand_pixel_32_1x128 (src);
2425     xmm_alpha = expand_alpha_1x128 (xmm_src);
2426
2427     while (height--)
2428     {
2429         dst = dst_line;
2430
2431         dst_line += dst_stride;
2432         w = width;
2433
2434         while (w && (unsigned long)dst & 15)
2435         {
2436             d = *dst;
2437             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2438                                                 xmm_alpha,
2439                                                 unpack_32_1x128 (d)));
2440             w--;
2441         }
2442
2443         while (w >= 4)
2444         {
2445             xmm_dst = load_128_aligned ((__m128i*)dst);
2446
2447             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2448
2449             over_2x128 (&xmm_src, &xmm_src,
2450                         &xmm_alpha, &xmm_alpha,
2451                         &xmm_dst_lo, &xmm_dst_hi);
2452
2453             /* rebuild the 4 pixel data and save */
2454             save_128_aligned (
2455                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2456
2457             w -= 4;
2458             dst += 4;
2459         }
2460
2461         while (w)
2462         {
2463             d = *dst;
2464             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2465                                                 xmm_alpha,
2466                                                 unpack_32_1x128 (d)));
2467             w--;
2468         }
2469
2470     }
2471 }
2472
2473 /* ---------------------------------------------------------------------
2474  * composite_over_n_0565
2475  */
2476 static void
2477 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2478                             pixman_op_t              op,
2479                             pixman_image_t *         src_image,
2480                             pixman_image_t *         mask_image,
2481                             pixman_image_t *         dst_image,
2482                             int32_t                  src_x,
2483                             int32_t                  src_y,
2484                             int32_t                  mask_x,
2485                             int32_t                  mask_y,
2486                             int32_t                  dest_x,
2487                             int32_t                  dest_y,
2488                             int32_t                  width,
2489                             int32_t                  height)
2490 {
2491     uint32_t src;
2492     uint16_t    *dst_line, *dst, d;
2493     int32_t w;
2494     int dst_stride;
2495     __m128i xmm_src, xmm_alpha;
2496     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2497
2498     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2499
2500     if (src == 0)
2501         return;
2502
2503     PIXMAN_IMAGE_GET_LINE (
2504         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2505
2506     xmm_src = expand_pixel_32_1x128 (src);
2507     xmm_alpha = expand_alpha_1x128 (xmm_src);
2508
2509     while (height--)
2510     {
2511         dst = dst_line;
2512
2513         dst_line += dst_stride;
2514         w = width;
2515
2516         while (w && (unsigned long)dst & 15)
2517         {
2518             d = *dst;
2519
2520             *dst++ = pack_565_32_16 (
2521                 pack_1x128_32 (over_1x128 (xmm_src,
2522                                            xmm_alpha,
2523                                            expand565_16_1x128 (d))));
2524             w--;
2525         }
2526
2527         while (w >= 8)
2528         {
2529             xmm_dst = load_128_aligned ((__m128i*)dst);
2530
2531             unpack_565_128_4x128 (xmm_dst,
2532                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2533
2534             over_2x128 (&xmm_src, &xmm_src,
2535                         &xmm_alpha, &xmm_alpha,
2536                         &xmm_dst0, &xmm_dst1);
2537             over_2x128 (&xmm_src, &xmm_src,
2538                         &xmm_alpha, &xmm_alpha,
2539                         &xmm_dst2, &xmm_dst3);
2540
2541             xmm_dst = pack_565_4x128_128 (
2542                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2543
2544             save_128_aligned ((__m128i*)dst, xmm_dst);
2545
2546             dst += 8;
2547             w -= 8;
2548         }
2549
2550         while (w--)
2551         {
2552             d = *dst;
2553             *dst++ = pack_565_32_16 (
2554                 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2555                                            expand565_16_1x128 (d))));
2556         }
2557     }
2558
2559 }
2560
2561 /* ------------------------------
2562  * composite_add_n_8888_8888_ca
2563  */
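
/* Solid source with a component-alpha a8r8g8b8 mask; each pixel is
 *
 *     dst.c = MIN (mask.c * src.c + dst.c, 255)
 *
 * The vector loop compares four mask pixels against zero first and,
 * when _mm_movemask_epi8 reports all lanes zero, skips the block
 * entirely, a cheap early-out for fully transparent spans.
 */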
2564 static void
2565 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2566                                    pixman_op_t              op,
2567                                    pixman_image_t *         src_image,
2568                                    pixman_image_t *         mask_image,
2569                                    pixman_image_t *         dst_image,
2570                                    int32_t                  src_x,
2571                                    int32_t                  src_y,
2572                                    int32_t                  mask_x,
2573                                    int32_t                  mask_y,
2574                                    int32_t                  dest_x,
2575                                    int32_t                  dest_y,
2576                                    int32_t                  width,
2577                                    int32_t                  height)
2578 {
2579     uint32_t src;
2580     uint32_t    *dst_line, d;
2581     uint32_t    *mask_line, m;
2582     uint32_t pack_cmp;
2583     int dst_stride, mask_stride;
2584
2585     __m128i xmm_src, xmm_alpha;
2586     __m128i xmm_dst;
2587     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2588
2589     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2590
2591     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2593
2594     if (src == 0)
2595         return;
2596
2597     PIXMAN_IMAGE_GET_LINE (
2598         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2599     PIXMAN_IMAGE_GET_LINE (
2600         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2601
2602     xmm_src = _mm_unpacklo_epi8 (
2603         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2604     xmm_alpha = expand_alpha_1x128 (xmm_src);
2605     mmx_src   = xmm_src;
2606     mmx_alpha = xmm_alpha;
2607
2608     while (height--)
2609     {
2610         int w = width;
2611         const uint32_t *pm = (uint32_t *)mask_line;
2612         uint32_t *pd = (uint32_t *)dst_line;
2613
2614         dst_line += dst_stride;
2615         mask_line += mask_stride;
2616
2617         while (w && (unsigned long)pd & 15)
2618         {
2619             m = *pm++;
2620
2621             if (m)
2622             {
2623                 d = *pd;
2624
2625                 mmx_mask = unpack_32_1x128 (m);
2626                 mmx_dest = unpack_32_1x128 (d);
2627
2628                 *pd = pack_1x128_32 (
2629                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2630             }
2631
2632             pd++;
2633             w--;
2634         }
2635
2636         while (w >= 4)
2637         {
2638             xmm_mask = load_128_unaligned ((__m128i*)pm);
2639
2640             pack_cmp =
2641                 _mm_movemask_epi8 (
2642                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2643
2644         /* if all bits of the mask are zero, pack_cmp is 0xffff */
2645             if (pack_cmp != 0xffff)
2646             {
2647                 xmm_dst = load_128_aligned ((__m128i*)pd);
2648
2649                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2650
2651                 pix_multiply_2x128 (&xmm_src, &xmm_src,
2652                                     &xmm_mask_lo, &xmm_mask_hi,
2653                                     &xmm_mask_lo, &xmm_mask_hi);
2654                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2655
2656                 save_128_aligned (
2657                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2658             }
2659
2660             pd += 4;
2661             pm += 4;
2662             w -= 4;
2663         }
2664
2665         while (w)
2666         {
2667             m = *pm++;
2668
2669             if (m)
2670             {
2671                 d = *pd;
2672
2673                 mmx_mask = unpack_32_1x128 (m);
2674                 mmx_dest = unpack_32_1x128 (d);
2675
2676                 *pd = pack_1x128_32 (
2677                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2678             }
2679
2680             pd++;
2681             w--;
2682         }
2683     }
2684
2685 }
2686
2687 /* ---------------------------------------------------------------------------
2688  * composite_over_n_8888_8888_ca
2689  */
2690
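/* The OVER counterpart of the function above; each pixel is
 *
 *     dst = src * mask + dst * (1 - mask * src.a)
 *
 * evaluated channel-wise by in_over_1x128 / in_over_2x128, with the
 * same movemask early-out for fully transparent mask blocks.
 */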
2691 static void
2692 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2693                                     pixman_op_t              op,
2694                                     pixman_image_t *         src_image,
2695                                     pixman_image_t *         mask_image,
2696                                     pixman_image_t *         dst_image,
2697                                     int32_t                  src_x,
2698                                     int32_t                  src_y,
2699                                     int32_t                  mask_x,
2700                                     int32_t                  mask_y,
2701                                     int32_t                  dest_x,
2702                                     int32_t                  dest_y,
2703                                     int32_t                  width,
2704                                     int32_t                  height)
2705 {
2706     uint32_t src;
2707     uint32_t    *dst_line, d;
2708     uint32_t    *mask_line, m;
2709     uint32_t pack_cmp;
2710     int dst_stride, mask_stride;
2711
2712     __m128i xmm_src, xmm_alpha;
2713     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2714     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2715
2716     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2717
2718     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2719
2720     if (src == 0)
2721         return;
2722
2723     PIXMAN_IMAGE_GET_LINE (
2724         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2725     PIXMAN_IMAGE_GET_LINE (
2726         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2727
2728     xmm_src = _mm_unpacklo_epi8 (
2729         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2730     xmm_alpha = expand_alpha_1x128 (xmm_src);
2731     mmx_src   = xmm_src;
2732     mmx_alpha = xmm_alpha;
2733
2734     while (height--)
2735     {
2736         int w = width;
2737         const uint32_t *pm = (uint32_t *)mask_line;
2738         uint32_t *pd = (uint32_t *)dst_line;
2739
2740         dst_line += dst_stride;
2741         mask_line += mask_stride;
2742
2743         while (w && (unsigned long)pd & 15)
2744         {
2745             m = *pm++;
2746
2747             if (m)
2748             {
2749                 d = *pd;
2750                 mmx_mask = unpack_32_1x128 (m);
2751                 mmx_dest = unpack_32_1x128 (d);
2752
2753                 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2754                                                   &mmx_alpha,
2755                                                   &mmx_mask,
2756                                                   &mmx_dest));
2757             }
2758
2759             pd++;
2760             w--;
2761         }
2762
2763         while (w >= 4)
2764         {
2765             xmm_mask = load_128_unaligned ((__m128i*)pm);
2766
2767             pack_cmp =
2768                 _mm_movemask_epi8 (
2769                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2770
2771         /* if all bits of the mask are zero, pack_cmp is 0xffff */
2772             if (pack_cmp != 0xffff)
2773             {
2774                 xmm_dst = load_128_aligned ((__m128i*)pd);
2775
2776                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2777                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2778
2779                 in_over_2x128 (&xmm_src, &xmm_src,
2780                                &xmm_alpha, &xmm_alpha,
2781                                &xmm_mask_lo, &xmm_mask_hi,
2782                                &xmm_dst_lo, &xmm_dst_hi);
2783
2784                 save_128_aligned (
2785                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2786             }
2787
2788             pd += 4;
2789             pm += 4;
2790             w -= 4;
2791         }
2792
2793         while (w)
2794         {
2795             m = *pm++;
2796
2797             if (m)
2798             {
2799                 d = *pd;
2800                 mmx_mask = unpack_32_1x128 (m);
2801                 mmx_dest = unpack_32_1x128 (d);
2802
2803                 *pd = pack_1x128_32 (
2804                     in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2805             }
2806
2807             pd++;
2808             w--;
2809         }
2810     }
2811
2812 }
2813
2814 /*---------------------------------------------------------------------
2815  * composite_over_8888_n_8888
2816  */
2817
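/* Here the mask is a solid color, of which only the alpha byte is
 * used: create_mask_16_128 (mask >> 24) broadcasts it across all
 * channel lanes.  Each pixel is
 *
 *     dst = src * m + dst * (1 - src.a * m)
 *
 * and is_zero () skips blocks of four fully transparent source pixels.
 */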
2818 static void
2819 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2820                                  pixman_op_t              op,
2821                                  pixman_image_t *         src_image,
2822                                  pixman_image_t *         mask_image,
2823                                  pixman_image_t *         dst_image,
2824                                  int32_t                  src_x,
2825                                  int32_t                  src_y,
2826                                  int32_t                  mask_x,
2827                                  int32_t                  mask_y,
2828                                  int32_t                  dest_x,
2829                                  int32_t                  dest_y,
2830                                  int32_t                  width,
2831                                  int32_t                  height)
2832 {
2833     uint32_t    *dst_line, *dst;
2834     uint32_t    *src_line, *src;
2835     uint32_t mask;
2836     int32_t w;
2837     int dst_stride, src_stride;
2838
2839     __m128i xmm_mask;
2840     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2841     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2842     __m128i xmm_alpha_lo, xmm_alpha_hi;
2843
2844     PIXMAN_IMAGE_GET_LINE (
2845         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2846     PIXMAN_IMAGE_GET_LINE (
2847         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2848
2849     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2850
2851     xmm_mask = create_mask_16_128 (mask >> 24);
2852
2853     while (height--)
2854     {
2855         dst = dst_line;
2856         dst_line += dst_stride;
2857         src = src_line;
2858         src_line += src_stride;
2859         w = width;
2860
2861         while (w && (unsigned long)dst & 15)
2862         {
2863             uint32_t s = *src++;
2864
2865             if (s)
2866             {
2867                 uint32_t d = *dst;
2868                 
2869                 __m128i ms = unpack_32_1x128 (s);
2870                 __m128i alpha    = expand_alpha_1x128 (ms);
2871                 __m128i mask  = xmm_mask;
2872                 __m128i dest  = unpack_32_1x128 (d);
2873
2874                 *dst = pack_1x128_32 (
2875                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2876             }
2877             dst++;
2878             w--;
2879         }
2880
2881         while (w >= 4)
2882         {
2883             xmm_src = load_128_unaligned ((__m128i*)src);
2884
2885             if (!is_zero (xmm_src))
2886             {
2887                 xmm_dst = load_128_aligned ((__m128i*)dst);
2888                 
2889                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2890                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2891                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2892                                     &xmm_alpha_lo, &xmm_alpha_hi);
2893                 
2894                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2895                                &xmm_alpha_lo, &xmm_alpha_hi,
2896                                &xmm_mask, &xmm_mask,
2897                                &xmm_dst_lo, &xmm_dst_hi);
2898                 
2899                 save_128_aligned (
2900                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2901             }
2902                 
2903             dst += 4;
2904             src += 4;
2905             w -= 4;
2906         }
2907
2908         while (w)
2909         {
2910             uint32_t s = *src++;
2911
2912             if (s)
2913             {
2914                 uint32_t d = *dst;
2915                 
2916                 __m128i ms = unpack_32_1x128 (s);
2917                 __m128i alpha = expand_alpha_1x128 (ms);
2918                 __m128i mask  = xmm_mask;
2919                 __m128i dest  = unpack_32_1x128 (d);
2920                 
2921                 *dst = pack_1x128_32 (
2922                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2923             }
2924
2925             dst++;
2926             w--;
2927         }
2928     }
2929
2930 }
2931
2932 /*---------------------------------------------------------------------
2933  * composite_src_x888_8888
2934  */
2935
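/* A pure copy that forces the alpha byte:  *dst = *src | 0xff000000.
 * The vector body ORs mask_ff000000 into 16 pixels per iteration.
 */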
2936 static void
2937 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2938                               pixman_op_t              op,
2939                               pixman_image_t *         src_image,
2940                               pixman_image_t *         mask_image,
2941                               pixman_image_t *         dst_image,
2942                               int32_t                  src_x,
2943                               int32_t                  src_y,
2944                               int32_t                  mask_x,
2945                               int32_t                  mask_y,
2946                               int32_t                  dest_x,
2947                               int32_t                  dest_y,
2948                               int32_t                  width,
2949                               int32_t                  height)
2950 {
2951     uint32_t    *dst_line, *dst;
2952     uint32_t    *src_line, *src;
2953     int32_t w;
2954     int dst_stride, src_stride;
2955
2957     PIXMAN_IMAGE_GET_LINE (
2958         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2959     PIXMAN_IMAGE_GET_LINE (
2960         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2961
2962     while (height--)
2963     {
2964         dst = dst_line;
2965         dst_line += dst_stride;
2966         src = src_line;
2967         src_line += src_stride;
2968         w = width;
2969
2970         while (w && (unsigned long)dst & 15)
2971         {
2972             *dst++ = *src++ | 0xff000000;
2973             w--;
2974         }
2975
2976         while (w >= 16)
2977         {
2978             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2979             
2980             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2981             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2982             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2983             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2984             
2985             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2986             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2987             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2988             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2989             
2990             dst += 16;
2991             src += 16;
2992             w -= 16;
2993         }
2994
2995         while (w)
2996         {
2997             *dst++ = *src++ | 0xff000000;
2998             w--;
2999         }
3000     }
3001
3002 }
3003
3004 /* ---------------------------------------------------------------------
3005  * composite_over_x888_n_8888
3006  */
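
/* The source is x888, i.e. effectively opaque, so with xmm_alpha set
 * to the constant 1.0 vector (mask_00ff) the in_over reduces to a
 * linear blend by the mask alpha m:
 *
 *     dst = src * m + dst * (1 - m)
 */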
3007 static void
3008 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3009                                  pixman_op_t              op,
3010                                  pixman_image_t *         src_image,
3011                                  pixman_image_t *         mask_image,
3012                                  pixman_image_t *         dst_image,
3013                                  int32_t                  src_x,
3014                                  int32_t                  src_y,
3015                                  int32_t                  mask_x,
3016                                  int32_t                  mask_y,
3017                                  int32_t                  dest_x,
3018                                  int32_t                  dest_y,
3019                                  int32_t                  width,
3020                                  int32_t                  height)
3021 {
3022     uint32_t    *dst_line, *dst;
3023     uint32_t    *src_line, *src;
3024     uint32_t mask;
3025     int dst_stride, src_stride;
3026     int32_t w;
3027
3028     __m128i xmm_mask, xmm_alpha;
3029     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3030     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3031
3032     PIXMAN_IMAGE_GET_LINE (
3033         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3034     PIXMAN_IMAGE_GET_LINE (
3035         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3036
3037     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3038
3039     xmm_mask = create_mask_16_128 (mask >> 24);
3040     xmm_alpha = mask_00ff;
3041
3042     while (height--)
3043     {
3044         dst = dst_line;
3045         dst_line += dst_stride;
3046         src = src_line;
3047         src_line += src_stride;
3048         w = width;
3049
3050         while (w && (unsigned long)dst & 15)
3051         {
3052             uint32_t s = (*src++) | 0xff000000;
3053             uint32_t d = *dst;
3054
3055             __m128i src   = unpack_32_1x128 (s);
3056             __m128i alpha = xmm_alpha;
3057             __m128i mask  = xmm_mask;
3058             __m128i dest  = unpack_32_1x128 (d);
3059
3060             *dst++ = pack_1x128_32 (
3061                 in_over_1x128 (&src, &alpha, &mask, &dest));
3062
3063             w--;
3064         }
3065
3066         while (w >= 4)
3067         {
3068             xmm_src = _mm_or_si128 (
3069                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3070             xmm_dst = load_128_aligned ((__m128i*)dst);
3071
3072             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3073             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3074
3075             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3076                            &xmm_alpha, &xmm_alpha,
3077                            &xmm_mask, &xmm_mask,
3078                            &xmm_dst_lo, &xmm_dst_hi);
3079
3080             save_128_aligned (
3081                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3082
3083             dst += 4;
3084             src += 4;
3085             w -= 4;
3087         }
3088
3089         while (w)
3090         {
3091             uint32_t s = (*src++) | 0xff000000;
3092             uint32_t d = *dst;
3093
3094             __m128i src  = unpack_32_1x128 (s);
3095             __m128i alpha = xmm_alpha;
3096             __m128i mask  = xmm_mask;
3097             __m128i dest  = unpack_32_1x128 (d);
3098
3099             *dst++ = pack_1x128_32 (
3100                 in_over_1x128 (&src, &alpha, &mask, &dest));
3101
3102             w--;
3103         }
3104     }
3105
3106 }
3107
3108 /* --------------------------------------------------------------------
3109  * composite_over_8888_8888
3110  */
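
/* No mask and no solid color to exploit here, so each scanline is
 * handed directly to sse2_combine_over_u.
 */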
3111 static void
3112 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3113                                pixman_op_t              op,
3114                                pixman_image_t *         src_image,
3115                                pixman_image_t *         mask_image,
3116                                pixman_image_t *         dst_image,
3117                                int32_t                  src_x,
3118                                int32_t                  src_y,
3119                                int32_t                  mask_x,
3120                                int32_t                  mask_y,
3121                                int32_t                  dest_x,
3122                                int32_t                  dest_y,
3123                                int32_t                  width,
3124                                int32_t                  height)
3125 {
3126     int dst_stride, src_stride;
3127     uint32_t    *dst_line, *dst;
3128     uint32_t    *src_line, *src;
3129
3130     PIXMAN_IMAGE_GET_LINE (
3131         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3132     PIXMAN_IMAGE_GET_LINE (
3133         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3134
3135     dst = dst_line;
3136     src = src_line;
3137
3138     while (height--)
3139     {
3140         sse2_combine_over_u (imp, op, dst, src, NULL, width);
3141
3142         dst += dst_stride;
3143         src += src_stride;
3144     }
3145 }
3146
3147 /* ------------------------------------------------------------------
3148  * composite_over_8888_0565
3149  */
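
/* r5g6b5 destinations are expanded to a8r8g8b8, composited with OVER,
 * and repacked.  The vector loop works on 8 pixels at a time because a
 * 128-bit register holds eight 565 pixels but only four 8888 ones.
 */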
3150 static force_inline uint16_t
3151 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3152 {
3153     __m128i ms;
3154
3155     ms = unpack_32_1x128 (src);
3156     return pack_565_32_16 (
3157         pack_1x128_32 (
3158             over_1x128 (
3159                 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3160 }
3161
3162 static void
3163 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3164                                pixman_op_t              op,
3165                                pixman_image_t *         src_image,
3166                                pixman_image_t *         mask_image,
3167                                pixman_image_t *         dst_image,
3168                                int32_t                  src_x,
3169                                int32_t                  src_y,
3170                                int32_t                  mask_x,
3171                                int32_t                  mask_y,
3172                                int32_t                  dest_x,
3173                                int32_t                  dest_y,
3174                                int32_t                  width,
3175                                int32_t                  height)
3176 {
3177     uint16_t    *dst_line, *dst, d;
3178     uint32_t    *src_line, *src, s;
3179     int dst_stride, src_stride;
3180     int32_t w;
3181
3182     __m128i xmm_alpha_lo, xmm_alpha_hi;
3183     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3184     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3185
3186     PIXMAN_IMAGE_GET_LINE (
3187         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3188     PIXMAN_IMAGE_GET_LINE (
3189         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3190
3191 #if 0
3192     /* FIXME
3193      *
3194      * This code was copied from the MMX version, FIXME included.
3195      * If it's a problem there, it is probably a problem here too.
3196      */
3197     assert (src_image->drawable == mask_image->drawable);
3198 #endif
3199
3200     while (height--)
3201     {
3202         dst = dst_line;
3203         src = src_line;
3204
3205         dst_line += dst_stride;
3206         src_line += src_stride;
3207         w = width;
3208
3209         /* Align dst on a 16-byte boundary */
3210         while (w &&
3211                ((unsigned long)dst & 15))
3212         {
3213             s = *src++;
3214             d = *dst;
3215
3216             *dst++ = composite_over_8888_0565pixel (s, d);
3217             w--;
3218         }
3219
3220         /* It's an 8-pixel loop */
3221         while (w >= 8)
3222         {
3223             /* Load unaligned: the source address is not
3224              * guaranteed to be 16-byte aligned.
3225              */
3226             xmm_src = load_128_unaligned ((__m128i*) src);
3227             xmm_dst = load_128_aligned ((__m128i*) dst);
3228
3229             /* Unpacking */
3230             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3231             unpack_565_128_4x128 (xmm_dst,
3232                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3233             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3234                                 &xmm_alpha_lo, &xmm_alpha_hi);
3235
3236             /* Load the next 4 pixels from memory early,
3237              * so the read overlaps with the blending below.
3238              */
3239             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3240
3241             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3242                         &xmm_alpha_lo, &xmm_alpha_hi,
3243                         &xmm_dst0, &xmm_dst1);
3244
3245             /* Unpacking */
3246             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3247             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3248                                 &xmm_alpha_lo, &xmm_alpha_hi);
3249
3250             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3251                         &xmm_alpha_lo, &xmm_alpha_hi,
3252                         &xmm_dst2, &xmm_dst3);
3253
3254             save_128_aligned (
3255                 (__m128i*)dst, pack_565_4x128_128 (
3256                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3257
3258             w -= 8;
3259             dst += 8;
3260             src += 8;
3261         }
3262
3263         while (w--)
3264         {
3265             s = *src++;
3266             d = *dst;
3267
3268             *dst++ = composite_over_8888_0565pixel (s, d);
3269         }
3270     }
3271
3272 }
3273
3274 /* -----------------------------------------------------------------
3275  * composite_over_n_8_8888
3276  */
3277
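/* Solid source with an a8 mask.  The vector loop reads four mask bytes
 * at once; a block of 0xffffffff with an opaque source stores the
 * precomputed solid value xmm_def directly, and a block of zero is
 * skipped entirely.
 */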
3278 static void
3279 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3280                               pixman_op_t              op,
3281                               pixman_image_t *         src_image,
3282                               pixman_image_t *         mask_image,
3283                               pixman_image_t *         dst_image,
3284                               int32_t                  src_x,
3285                               int32_t                  src_y,
3286                               int32_t                  mask_x,
3287                               int32_t                  mask_y,
3288                               int32_t                  dest_x,
3289                               int32_t                  dest_y,
3290                               int32_t                  width,
3291                               int32_t                  height)
3292 {
3293     uint32_t src, srca;
3294     uint32_t *dst_line, *dst;
3295     uint8_t *mask_line, *mask;
3296     int dst_stride, mask_stride;
3297     int32_t w;
3298     uint32_t m, d;
3299
3300     __m128i xmm_src, xmm_alpha, xmm_def;
3301     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3302     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3303
3304     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3305
3306     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3307
3308     srca = src >> 24;
3309     if (src == 0)
3310         return;
3311
3312     PIXMAN_IMAGE_GET_LINE (
3313         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3314     PIXMAN_IMAGE_GET_LINE (
3315         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3316
3317     xmm_def = create_mask_2x32_128 (src, src);
3318     xmm_src = expand_pixel_32_1x128 (src);
3319     xmm_alpha = expand_alpha_1x128 (xmm_src);
3320     mmx_src   = xmm_src;
3321     mmx_alpha = xmm_alpha;
3322
3323     while (height--)
3324     {
3325         dst = dst_line;
3326         dst_line += dst_stride;
3327         mask = mask_line;
3328         mask_line += mask_stride;
3329         w = width;
3330
3331         while (w && (unsigned long)dst & 15)
3332         {
3333             uint8_t m = *mask++;
3334
3335             if (m)
3336             {
3337                 d = *dst;
3338                 mmx_mask = expand_pixel_8_1x128 (m);
3339                 mmx_dest = unpack_32_1x128 (d);
3340
3341                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3342                                                    &mmx_alpha,
3343                                                    &mmx_mask,
3344                                                    &mmx_dest));
3345             }
3346
3347             w--;
3348             dst++;
3349         }
3350
3351         while (w >= 4)
3352         {
3353             m = *((uint32_t*)mask);
3354
3355             if (srca == 0xff && m == 0xffffffff)
3356             {
3357                 save_128_aligned ((__m128i*)dst, xmm_def);
3358             }
3359             else if (m)
3360             {
3361                 xmm_dst = load_128_aligned ((__m128i*) dst);
3362                 xmm_mask = unpack_32_1x128 (m);
3363                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3364
3365                 /* Unpacking */
3366                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3367                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3368
3369                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3370                                         &xmm_mask_lo, &xmm_mask_hi);
3371
3372                 in_over_2x128 (&xmm_src, &xmm_src,
3373                                &xmm_alpha, &xmm_alpha,
3374                                &xmm_mask_lo, &xmm_mask_hi,
3375                                &xmm_dst_lo, &xmm_dst_hi);
3376
3377                 save_128_aligned (
3378                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3379             }
3380
3381             w -= 4;
3382             dst += 4;
3383             mask += 4;
3384         }
3385
3386         while (w)
3387         {
3388             uint8_t m = *mask++;
3389
3390             if (m)
3391             {
3392                 d = *dst;
3393                 mmx_mask = expand_pixel_8_1x128 (m);
3394                 mmx_dest = unpack_32_1x128 (d);
3395
3396                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3397                                                    &mmx_alpha,
3398                                                    &mmx_mask,
3399                                                    &mmx_dest));
3400             }
3401
3402             w--;
3403             dst++;
3404         }
3405     }
3406
3407 }
3408
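/* A scalar sketch of what in_over computes per channel may help when
 * reading the loops above (illustrative only; these helpers are not
 * part of pixman's API).  mul_div_255 is the usual rounded byte
 * multiply, approximately (a * b) / 255:
 *
 *     static uint8_t mul_div_255 (uint8_t a, uint8_t b)
 *     {
 *         uint32_t t = a * b + 0x80;
 *         return (uint8_t) ((t + (t >> 8)) >> 8);
 *     }
 *
 *     static uint8_t in_over_channel (uint8_t s, uint8_t sa,
 *                                     uint8_t m, uint8_t d)
 *     {
 *         uint8_t sm  = mul_div_255 (s, m);
 *         uint8_t sam = mul_div_255 (sa, m);
 *         return sm + mul_div_255 (d, 255 - sam);
 *     }
 *
 * The _1x128 and _2x128 variants evaluate the same expression on
 * 16-bit channels packed into 128-bit registers.
 */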
3409 /* ----------------------------------------------------------------
3410  * pixman_fill_sse2
3411  */
3412
3413 pixman_bool_t
3414 pixman_fill_sse2 (uint32_t *bits,
3415                   int       stride,
3416                   int       bpp,
3417                   int       x,
3418                   int       y,
3419                   int       width,
3420                   int       height,
3421                   uint32_t  data)
3422 {
3423     uint32_t byte_width;
3424     uint8_t         *byte_line;
3425
3426     __m128i xmm_def;
3427
3428     if (bpp == 8)
3429     {
3430         uint8_t b;
3431         uint16_t w;
3432
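        /* stride is in uint32_t units; convert it to byte units.  The
         * "/ 1" and "*= 1" are no-ops, kept for symmetry with the
         * 16 bpp and 32 bpp cases below. */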
3433         stride = stride * (int) sizeof (uint32_t) / 1;
3434         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3435         byte_width = width;
3436         stride *= 1;
3437
3438         b = data & 0xff;
3439         w = (b << 8) | b;
3440         data = (w << 16) | w;
3441     }
3442     else if (bpp == 16)
3443     {
3444         stride = stride * (int) sizeof (uint32_t) / 2;
3445         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3446         byte_width = 2 * width;
3447         stride *= 2;
3448
3449         data = (data & 0xffff) * 0x00010001;
3450     }
3451     else if (bpp == 32)
3452     {
3453         stride = stride * (int) sizeof (uint32_t) / 4;
3454         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3455         byte_width = 4 * width;
3456         stride *= 4;
3457     }
3458     else
3459     {
3460         return FALSE;
3461     }
3462
3463     xmm_def = create_mask_2x32_128 (data, data);
3464
3465     while (height--)
3466     {
3467         int w;
3468         uint8_t *d = byte_line;
3469         byte_line += stride;
3470         w = byte_width;
3471
3472         while (w >= 1 && ((unsigned long)d & 1))
3473         {
3474             *(uint8_t *)d = data;
3475             w -= 1;
3476             d += 1;
3477         }
3478
3479         while (w >= 2 && ((unsigned long)d & 3))
3480         {
3481             *(uint16_t *)d = data;
3482             w -= 2;
3483             d += 2;
3484         }
3485
3486         while (w >= 4 && ((unsigned long)d & 15))
3487         {
3488             *(uint32_t *)d = data;
3489
3490             w -= 4;
3491             d += 4;
3492         }
3493
3494         while (w >= 128)
3495         {
3496             save_128_aligned ((__m128i*)(d),     xmm_def);
3497             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3498             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3499             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3500             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3501             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3502             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3503             save_128_aligned ((__m128i*)(d + 112), xmm_def);
3504
3505             d += 128;
3506             w -= 128;
3507         }
3508
3509         if (w >= 64)
3510         {
3511             save_128_aligned ((__m128i*)(d),     xmm_def);
3512             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3513             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3514             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3515
3516             d += 64;
3517             w -= 64;
3518         }
3519
3520         if (w >= 32)
3521         {
3522             save_128_aligned ((__m128i*)(d),     xmm_def);
3523             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3524
3525             d += 32;
3526             w -= 32;
3527         }
3528
3529         if (w >= 16)
3530         {
3531             save_128_aligned ((__m128i*)(d),     xmm_def);
3532
3533             d += 16;
3534             w -= 16;
3535         }
3536
3537         while (w >= 4)
3538         {
3539             *(uint32_t *)d = data;
3540
3541             w -= 4;
3542             d += 4;
3543         }
3544
3545         if (w >= 2)
3546         {
3547             *(uint16_t *)d = data;
3548             w -= 2;
3549             d += 2;
3550         }
3551
3552         if (w >= 1)
3553         {
3554             *(uint8_t *)d = data;
3555             w -= 1;
3556             d += 1;
3557         }
3558     }
3559
3560     return TRUE;
3561 }
3562
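/* Usage sketch for pixman_fill_sse2 (hypothetical buffer; strides are
 * in uint32_t units, as everywhere in pixman):
 *
 *     uint32_t buf[64 * 64];
 *
 *     pixman_fill_sse2 (buf, 64, 32, 10, 20, 30, 15, 0xffff0000);
 *
 * fills the 30x15 rectangle at (10, 20) of a 64-pixel-wide 32 bpp
 * buffer with opaque red.  For 8 and 16 bpp the fill value is first
 * replicated across all 32 bits of `data', so each aligned 128-bit
 * store writes 16, 8 or 4 pixels respectively.
 */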
3563 static void
3564 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3565                              pixman_op_t              op,
3566                              pixman_image_t *         src_image,
3567                              pixman_image_t *         mask_image,
3568                              pixman_image_t *         dst_image,
3569                              int32_t                  src_x,
3570                              int32_t                  src_y,
3571                              int32_t                  mask_x,
3572                              int32_t                  mask_y,
3573                              int32_t                  dest_x,
3574                              int32_t                  dest_y,
3575                              int32_t                  width,
3576                              int32_t                  height)
3577 {
3578     uint32_t src, srca;
3579     uint32_t    *dst_line, *dst;
3580     uint8_t     *mask_line, *mask;
3581     int dst_stride, mask_stride;
3582     int32_t w;
3583     uint32_t m;
3584
3585     __m128i xmm_src, xmm_def;
3586     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3587
3588     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3589
3590     srca = src >> 24;
3591     if (src == 0)
3592     {
3593         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3594                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
3595                           dest_x, dest_y, width, height, 0);
3596         return;
3597     }
3598
3599     PIXMAN_IMAGE_GET_LINE (
3600         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3601     PIXMAN_IMAGE_GET_LINE (
3602         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3603
3604     xmm_def = create_mask_2x32_128 (src, src);
3605     xmm_src = expand_pixel_32_1x128 (src);
3606
3607     while (height--)
3608     {
3609         dst = dst_line;
3610         dst_line += dst_stride;
3611         mask = mask_line;
3612         mask_line += mask_stride;
3613         w = width;
3614
3615         while (w && (unsigned long)dst & 15)
3616         {
3617             uint8_t m = *mask++;
3618
3619             if (m)
3620             {
3621                 *dst = pack_1x128_32 (
3622                     pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3623             }
3624             else
3625             {
3626                 *dst = 0;
3627             }
3628
3629             w--;
3630             dst++;
3631         }
3632
3633         while (w >= 4)
3634         {
3635             m = *((uint32_t*)mask);
3636
3637             if (srca == 0xff && m == 0xffffffff)
3638             {
3639                 save_128_aligned ((__m128i*)dst, xmm_def);
3640             }
3641             else if (m)
3642             {
3643                 xmm_mask = unpack_32_1x128 (m);
3644                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3645
3646                 /* Unpacking */
3647                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3648
3649                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3650                                         &xmm_mask_lo, &xmm_mask_hi);
3651
3652                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3653                                     &xmm_mask_lo, &xmm_mask_hi,
3654                                     &xmm_mask_lo, &xmm_mask_hi);
3655
3656                 save_128_aligned (
3657                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3658             }
3659             else
3660             {
3661                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3662             }
3663
3664             w -= 4;
3665             dst += 4;
3666             mask += 4;
3667         }
3668
3669         while (w)
3670         {
3671             uint8_t m = *mask++;
3672
3673             if (m)
3674             {
3675                 *dst = pack_1x128_32 (
3676                     pix_multiply_1x128 (
3677                         xmm_src, expand_pixel_8_1x128 (m)));
3678             }
3679             else
3680             {
3681                 *dst = 0;
3682             }
3683
3684             w--;
3685             dst++;
3686         }
3687     }
3688
3689 }
3690
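/* Unlike the OVER variant above, SRC replaces every destination pixel,
 * so a zero mask byte must store zero rather than leave the pixel
 * alone.  Per channel the general case is a single rounded multiply
 * (scalar sketch, using mul_div_255 as sketched earlier):
 *
 *     dst = mul_div_255 (src, m);
 *
 * which is why the vector loop never needs to load the destination:
 * it stores xmm_def untouched when srca is 0xff and the mask word is
 * 0xffffffff, zero when the mask word is zero, and src * mask
 * otherwise.
 */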
3691 /*-----------------------------------------------------------------------
3692  * composite_over_n_8_0565
3693  */
3694
3695 static void
3696 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3697                               pixman_op_t              op,
3698                               pixman_image_t *         src_image,
3699                               pixman_image_t *         mask_image,
3700                               pixman_image_t *         dst_image,
3701                               int32_t                  src_x,
3702                               int32_t                  src_y,
3703                               int32_t                  mask_x,
3704                               int32_t                  mask_y,
3705                               int32_t                  dest_x,
3706                               int32_t                  dest_y,
3707                               int32_t                  width,
3708                               int32_t                  height)
3709 {
3710     uint32_t src, srca;
3711     uint16_t    *dst_line, *dst, d;
3712     uint8_t     *mask_line, *mask;
3713     int dst_stride, mask_stride;
3714     int32_t w;
3715     uint32_t m;
3716     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3717
3718     __m128i xmm_src, xmm_alpha;
3719     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3720     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3721
3722     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3723
3724     srca = src >> 24;
3725     if (src == 0)
3726         return;
3727
3728     PIXMAN_IMAGE_GET_LINE (
3729         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3730     PIXMAN_IMAGE_GET_LINE (
3731         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3732
3733     xmm_src = expand_pixel_32_1x128 (src);
3734     xmm_alpha = expand_alpha_1x128 (xmm_src);
3735     mmx_src = xmm_src;
3736     mmx_alpha = xmm_alpha;
3737
3738     while (height--)
3739     {
3740         dst = dst_line;
3741         dst_line += dst_stride;
3742         mask = mask_line;
3743         mask_line += mask_stride;
3744         w = width;
3745
3746         while (w && (unsigned long)dst & 15)
3747         {
3748             m = *mask++;
3749
3750             if (m)
3751             {
3752                 d = *dst;
3753                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3754                 mmx_dest = expand565_16_1x128 (d);
3755
3756                 *dst = pack_565_32_16 (
3757                     pack_1x128_32 (
3758                         in_over_1x128 (
3759                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3760             }
3761
3762             w--;
3763             dst++;
3764         }
3765
3766         while (w >= 8)
3767         {
3768             xmm_dst = load_128_aligned ((__m128i*) dst);
3769             unpack_565_128_4x128 (xmm_dst,
3770                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3771
3772             m = *((uint32_t*)mask);
3773             mask += 4;
3774
3775             if (m)
3776             {
3777                 xmm_mask = unpack_32_1x128 (m);
3778                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3779
3780                 /* Unpacking */
3781                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3782
3783                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3784                                         &xmm_mask_lo, &xmm_mask_hi);
3785
3786                 in_over_2x128 (&xmm_src, &xmm_src,
3787                                &xmm_alpha, &xmm_alpha,
3788                                &xmm_mask_lo, &xmm_mask_hi,
3789                                &xmm_dst0, &xmm_dst1);
3790             }
3791
3792             m = *((uint32_t*)mask);
3793             mask += 4;
3794
3795             if (m)
3796             {
3797                 xmm_mask = unpack_32_1x128 (m);
3798                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3799
3800                 /* Unpacking */
3801                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3802
3803                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3804                                         &xmm_mask_lo, &xmm_mask_hi);
3805                 in_over_2x128 (&xmm_src, &xmm_src,
3806                                &xmm_alpha, &xmm_alpha,
3807                                &xmm_mask_lo, &xmm_mask_hi,
3808                                &xmm_dst2, &xmm_dst3);
3809             }
3810
3811             save_128_aligned (
3812                 (__m128i*)dst, pack_565_4x128_128 (
3813                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3814
3815             w -= 8;
3816             dst += 8;
3817         }
3818
3819         while (w)
3820         {
3821             m = *mask++;
3822
3823             if (m)
3824             {
3825                 d = *dst;
3826                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3827                 mmx_dest = expand565_16_1x128 (d);
3828
3829                 *dst = pack_565_32_16 (
3830                     pack_1x128_32 (
3831                         in_over_1x128 (
3832                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3833             }
3834
3835             w--;
3836             dst++;
3837         }
3838     }
3839
3840 }
3841
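/* r5g6b5 pixels cannot be blended directly, so the loop above widens
 * them to 8888, blends, and packs the result back.  Scalar sketch of
 * the two conversions (illustrative only):
 *
 *     static uint32_t expand_565 (uint16_t p)
 *     {
 *         uint32_t r = (p >> 11) & 0x1f;
 *         uint32_t g = (p >> 5) & 0x3f;
 *         uint32_t b = p & 0x1f;
 *
 *         r = (r << 3) | (r >> 2);
 *         g = (g << 2) | (g >> 4);
 *         b = (b << 3) | (b >> 2);
 *
 *         return (r << 16) | (g << 8) | b;
 *     }
 *
 *     static uint16_t pack_565 (uint32_t p)
 *     {
 *         return (uint16_t) (((p >> 8) & 0xf800) |
 *                            ((p >> 5) & 0x07e0) |
 *                            ((p >> 3) & 0x001f));
 *     }
 *
 * unpack_565_128_4x128 and pack_565_4x128_128 do the same for eight
 * pixels at a time using the mask_565_* constants.
 */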
3842 /* -----------------------------------------------------------------------
3843  * composite_over_pixbuf_0565
3844  */
3845
3846 static void
3847 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3848                                  pixman_op_t              op,
3849                                  pixman_image_t *         src_image,
3850                                  pixman_image_t *         mask_image,
3851                                  pixman_image_t *         dst_image,
3852                                  int32_t                  src_x,
3853                                  int32_t                  src_y,
3854                                  int32_t                  mask_x,
3855                                  int32_t                  mask_y,
3856                                  int32_t                  dest_x,
3857                                  int32_t                  dest_y,
3858                                  int32_t                  width,
3859                                  int32_t                  height)
3860 {
3861     uint16_t    *dst_line, *dst, d;
3862     uint32_t    *src_line, *src, s;
3863     int dst_stride, src_stride;
3864     int32_t w;
3865     uint32_t opaque, zero;
3866
3867     __m128i ms;
3868     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3869     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3870
3871     PIXMAN_IMAGE_GET_LINE (
3872         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3873     PIXMAN_IMAGE_GET_LINE (
3874         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3875
3876 #if 0
3877     /* FIXME
3878      *
3879      * This code was copied from the MMX version, FIXME included.
3880      * If it's a problem there, it's probably a problem here too.
3881      */
3882     assert (src_image->drawable == mask_image->drawable);
3883 #endif
3884
3885     while (height--)
3886     {
3887         dst = dst_line;
3888         dst_line += dst_stride;
3889         src = src_line;
3890         src_line += src_stride;
3891         w = width;
3892
3893         while (w && (unsigned long)dst & 15)
3894         {
3895             s = *src++;
3896             d = *dst;
3897
3898             ms = unpack_32_1x128 (s);
3899
3900             *dst++ = pack_565_32_16 (
3901                 pack_1x128_32 (
3902                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3903             w--;
3904         }
3905
3906         while (w >= 8)
3907         {
3908             /* First round */
3909             xmm_src = load_128_unaligned ((__m128i*)src);
3910             xmm_dst = load_128_aligned  ((__m128i*)dst);
3911
3912             opaque = is_opaque (xmm_src);
3913             zero = is_zero (xmm_src);
3914
3915             unpack_565_128_4x128 (xmm_dst,
3916                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3917             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3918
3919             /* preload next round */
3920             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3921
3922             if (opaque)
3923             {
3924                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3925                                      &xmm_dst0, &xmm_dst1);
3926             }
3927             else if (!zero)
3928             {
3929                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3930                                         &xmm_dst0, &xmm_dst1);
3931             }
3932
3933             /* Second round */
3934             opaque = is_opaque (xmm_src);
3935             zero = is_zero (xmm_src);
3936
3937             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3938
3939             if (opaque)
3940             {
3941                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3942                                      &xmm_dst2, &xmm_dst3);
3943             }
3944             else if (!zero)
3945             {
3946                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3947                                         &xmm_dst2, &xmm_dst3);
3948             }
3949
3950             save_128_aligned (
3951                 (__m128i*)dst, pack_565_4x128_128 (
3952                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3953
3954             w -= 8;
3955             src += 8;
3956             dst += 8;
3957         }
3958
3959         while (w)
3960         {
3961             s = *src++;
3962             d = *dst;
3963
3964             ms = unpack_32_1x128 (s);
3965
3966             *dst++ = pack_565_32_16 (
3967                 pack_1x128_32 (
3968                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3969             w--;
3970         }
3971     }
3972
3973 }
3974
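/* "pixbuf" sources are non-premultiplied and have R and B swapped
 * relative to the destination, so over_rev_non_pre first premultiplies
 * and channel-swaps each pixel and then does a normal OVER.  Scalar
 * sketch (illustrative only, mul_div_255 as before):
 *
 *     uint32_t a = s >> 24;
 *     uint32_t r = mul_div_255 (s & 0xff, a);
 *     uint32_t g = mul_div_255 ((s >> 8) & 0xff, a);
 *     uint32_t b = mul_div_255 ((s >> 16) & 0xff, a);
 *
 *     s = (a << 24) | (r << 16) | (g << 8) | b;
 *
 * When a vector is fully opaque the premultiply is the identity, so
 * invert_colors_2x128 (the swap alone) produces the result directly;
 * when it is fully transparent the store is skipped entirely.
 */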
3975 /* -------------------------------------------------------------------------
3976  * composite_over_pixbuf_8888
3977  */
3978
3979 static void
3980 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3981                                  pixman_op_t              op,
3982                                  pixman_image_t *         src_image,
3983                                  pixman_image_t *         mask_image,
3984                                  pixman_image_t *         dst_image,
3985                                  int32_t                  src_x,
3986                                  int32_t                  src_y,
3987                                  int32_t                  mask_x,
3988                                  int32_t                  mask_y,
3989                                  int32_t                  dest_x,
3990                                  int32_t                  dest_y,
3991                                  int32_t                  width,
3992                                  int32_t                  height)
3993 {
3994     uint32_t    *dst_line, *dst, d;
3995     uint32_t    *src_line, *src, s;
3996     int dst_stride, src_stride;
3997     int32_t w;
3998     uint32_t opaque, zero;
3999
4000     __m128i xmm_src_lo, xmm_src_hi;
4001     __m128i xmm_dst_lo, xmm_dst_hi;
4002
4003     PIXMAN_IMAGE_GET_LINE (
4004         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4005     PIXMAN_IMAGE_GET_LINE (
4006         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4007
4008 #if 0
4009     /* FIXME
4010      *
4011      * This code was copied from the MMX version, FIXME included.
4012      * If it's a problem there, it's probably a problem here too.
4013      */
4014     assert (src_image->drawable == mask_image->drawable);
4015 #endif
4016
4017     while (height--)
4018     {
4019         dst = dst_line;
4020         dst_line += dst_stride;
4021         src = src_line;
4022         src_line += src_stride;
4023         w = width;
4024
4025         while (w && (unsigned long)dst & 15)
4026         {
4027             s = *src++;
4028             d = *dst;
4029
4030             *dst++ = pack_1x128_32 (
4031                 over_rev_non_pre_1x128 (
4032                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4033
4034             w--;
4035         }
4036
4037         while (w >= 4)
4038         {
4039             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4040
4041             opaque = is_opaque (xmm_src_hi);
4042             zero = is_zero (xmm_src_hi);
4043
4044             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4045
4046             if (opaque)
4047             {
4048                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4049                                      &xmm_dst_lo, &xmm_dst_hi);
4050
4051                 save_128_aligned (
4052                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4053             }
4054             else if (!zero)
4055             {
4056                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4057
4058                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4059
4060                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4061                                         &xmm_dst_lo, &xmm_dst_hi);
4062
4063                 save_128_aligned (
4064                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4065             }
4066
4067             w -= 4;
4068             dst += 4;
4069             src += 4;
4070         }
4071
4072         while (w)
4073         {
4074             s = *src++;
4075             d = *dst;
4076
4077             *dst++ = pack_1x128_32 (
4078                 over_rev_non_pre_1x128 (
4079                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4080
4081             w--;
4082         }
4083     }
4084
4085 }
4086
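/* The is_opaque / is_zero tests above classify four pixels with one
 * compare and one movemask each.  A sketch of how such tests look in
 * SSE2 (pixman's helpers, defined earlier in this file, are
 * equivalent in spirit):
 *
 *     static int all_alpha_ff (__m128i x)
 *     {
 *         __m128i ff = _mm_cmpeq_epi8 (x, x);
 *         int m = _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ff));
 *
 *         return (m & 0x8888) == 0x8888;
 *     }
 *
 *     static int all_zero (__m128i x)
 *     {
 *         return _mm_movemask_epi8 (
 *             _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
 *     }
 *
 * 0x8888 selects the movemask bits of bytes 3, 7, 11 and 15, i.e. the
 * alpha byte of each of the four 8888 pixels.  Opaque vectors take the
 * cheap copy path and all-zero vectors skip the store altogether.
 */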
4087 /* -------------------------------------------------------------------------------------------------
4088  * composite_over_n_8888_0565_ca
4089  */
4090
4091 static void
4092 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4093                                     pixman_op_t              op,
4094                                     pixman_image_t *         src_image,
4095                                     pixman_image_t *         mask_image,
4096                                     pixman_image_t *         dst_image,
4097                                     int32_t                  src_x,
4098                                     int32_t                  src_y,
4099                                     int32_t                  mask_x,
4100                                     int32_t                  mask_y,
4101                                     int32_t                  dest_x,
4102                                     int32_t                  dest_y,
4103                                     int32_t                  width,
4104                                     int32_t                  height)
4105 {
4106     uint32_t src;
4107     uint16_t    *dst_line, *dst, d;
4108     uint32_t    *mask_line, *mask, m;
4109     int dst_stride, mask_stride;
4110     int w;
4111     uint32_t pack_cmp;
4112
4113     __m128i xmm_src, xmm_alpha;
4114     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4115     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4116
4117     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4118
4119     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4120
4121     if (src == 0)
4122         return;
4123
4124     PIXMAN_IMAGE_GET_LINE (
4125         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4126     PIXMAN_IMAGE_GET_LINE (
4127         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4128
4129     xmm_src = expand_pixel_32_1x128 (src);
4130     xmm_alpha = expand_alpha_1x128 (xmm_src);
4131     mmx_src = xmm_src;
4132     mmx_alpha = xmm_alpha;
4133
4134     while (height--)
4135     {
4136         w = width;
4137         mask = mask_line;
4138         dst = dst_line;
4139         mask_line += mask_stride;
4140         dst_line += dst_stride;
4141
4142         while (w && ((unsigned long)dst & 15))
4143         {
4144             m = *(uint32_t *) mask;
4145
4146             if (m)
4147             {
4148                 d = *dst;
4149                 mmx_mask = unpack_32_1x128 (m);
4150                 mmx_dest = expand565_16_1x128 (d);
4151
4152                 *dst = pack_565_32_16 (
4153                     pack_1x128_32 (
4154                         in_over_1x128 (
4155                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4156             }
4157
4158             w--;
4159             dst++;
4160             mask++;
4161         }
4162
4163         while (w >= 8)
4164         {
4165             /* First round */
4166             xmm_mask = load_128_unaligned ((__m128i*)mask);
4167             xmm_dst = load_128_aligned ((__m128i*)dst);
4168
4169             pack_cmp = _mm_movemask_epi8 (
4170                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4171
4172             unpack_565_128_4x128 (xmm_dst,
4173                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4174             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4175
4176             /* preload next round */
4177             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4178
4180             if (pack_cmp != 0xffff)
4181             {
4182                 in_over_2x128 (&xmm_src, &xmm_src,
4183                                &xmm_alpha, &xmm_alpha,
4184                                &xmm_mask_lo, &xmm_mask_hi,
4185                                &xmm_dst0, &xmm_dst1);
4186             }
4187
4188             /* Second round */
4189             pack_cmp = _mm_movemask_epi8 (
4190                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4191
4192             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4193
4194             if (pack_cmp != 0xffff)
4195             {
4196                 in_over_2x128 (&xmm_src, &xmm_src,
4197                                &xmm_alpha, &xmm_alpha,
4198                                &xmm_mask_lo, &xmm_mask_hi,
4199                                &xmm_dst2, &xmm_dst3);
4200             }
4201
4202             save_128_aligned (
4203                 (__m128i*)dst, pack_565_4x128_128 (
4204                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4205
4206             w -= 8;
4207             dst += 8;
4208             mask += 8;
4209         }
4210
4211         while (w)
4212         {
4213             m = *(uint32_t *) mask;
4214
4215             if (m)
4216             {
4217                 d = *dst;
4218                 mmx_mask = unpack_32_1x128 (m);
4219                 mmx_dest = expand565_16_1x128 (d);
4220
4221                 *dst = pack_565_32_16 (
4222                     pack_1x128_32 (
4223                         in_over_1x128 (
4224                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4225             }
4226
4227             w--;
4228             dst++;
4229             mask++;
4230         }
4231     }
4232
4233 }
4234
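/* pack_cmp is a movemask over a 32-bit compare of the mask vector
 * against zero: it equals 0xffff exactly when all four component-alpha
 * mask values are zero, in which case the destination already holds
 * the correct pixels and the half-vector blend is skipped.  The scalar
 * equivalent of the test is simply:
 *
 *     if (m0 | m1 | m2 | m3)
 *         blend ();
 *
 * With component alpha, each of the mask's R, G and B bytes scales the
 * matching source channel separately, so the mask is fed to
 * in_over_2x128 as-is instead of being expanded from a single alpha
 * byte as in the a8-mask paths.
 */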
4235 /* -----------------------------------------------------------------------
4236  * composite_in_n_8_8
4237  */
4238
4239 static void
4240 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4241                          pixman_op_t              op,
4242                          pixman_image_t *         src_image,
4243                          pixman_image_t *         mask_image,
4244                          pixman_image_t *         dst_image,
4245                          int32_t                  src_x,
4246                          int32_t                  src_y,
4247                          int32_t                  mask_x,
4248                          int32_t                  mask_y,
4249                          int32_t                  dest_x,
4250                          int32_t                  dest_y,
4251                          int32_t                  width,
4252                          int32_t                  height)
4253 {
4254     uint8_t     *dst_line, *dst;
4255     uint8_t     *mask_line, *mask;
4256     int dst_stride, mask_stride;
4257     uint32_t d, m;
4258     uint32_t src;
4259     uint8_t sa;
4260     int32_t w;
4261
4262     __m128i xmm_alpha;
4263     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4264     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4265
4266     PIXMAN_IMAGE_GET_LINE (
4267         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4268     PIXMAN_IMAGE_GET_LINE (
4269         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4270
4271     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4272
4273     sa = src >> 24;
4274
4275     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4276
4277     while (height--)
4278     {
4279         dst = dst_line;
4280         dst_line += dst_stride;
4281         mask = mask_line;
4282         mask_line += mask_stride;
4283         w = width;
4284
4285         while (w && ((unsigned long)dst & 15))
4286         {
4287             m = (uint32_t) *mask++;
4288             d = (uint32_t) *dst;
4289
4290             *dst++ = (uint8_t) pack_1x128_32 (
4291                 pix_multiply_1x128 (
4292                     pix_multiply_1x128 (xmm_alpha,
4293                                        unpack_32_1x128 (m)),
4294                     unpack_32_1x128 (d)));
4295             w--;
4296         }
4297
4298         while (w >= 16)
4299         {
4300             xmm_mask = load_128_unaligned ((__m128i*)mask);
4301             xmm_dst = load_128_aligned ((__m128i*)dst);
4302
4303             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4304             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4305
4306             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4307                                 &xmm_mask_lo, &xmm_mask_hi,
4308                                 &xmm_mask_lo, &xmm_mask_hi);
4309
4310             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4311                                 &xmm_dst_lo, &xmm_dst_hi,
4312                                 &xmm_dst_lo, &xmm_dst_hi);
4313
4314             save_128_aligned (
4315                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4316
4317             mask += 16;
4318             dst += 16;
4319             w -= 16;
4320         }
4321
4322         while (w)
4323         {
4324             m = (uint32_t) *mask++;
4325             d = (uint32_t) *dst;
4326
4327             *dst++ = (uint8_t) pack_1x128_32 (
4328                 pix_multiply_1x128 (
4329                     pix_multiply_1x128 (
4330                         xmm_alpha, unpack_32_1x128 (m)),
4331                     unpack_32_1x128 (d)));
4332             w--;
4333         }
4334     }
4335
4336 }
4337
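/* The IN operator above is two rounded byte multiplies per channel:
 *
 *     dst = mul_div_255 (mul_div_255 (srca, m), dst);
 *
 * (scalar sketch, mul_div_255 as before).  The two pix_multiply_2x128
 * calls in the main loop compute exactly this on sixteen a8 pixels
 * per iteration.
 */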
4338 /* -----------------------------------------------------------------------
4339  * composite_in_n_8
4340  */
4341
4342 static void
4343 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4344                        pixman_op_t              op,
4345                        pixman_image_t *         src_image,
4346                        pixman_image_t *         mask_image,
4347                        pixman_image_t *         dst_image,
4348                        int32_t                  src_x,
4349                        int32_t                  src_y,
4350                        int32_t                  mask_x,
4351                        int32_t                  mask_y,
4352                        int32_t                  dest_x,
4353                        int32_t                  dest_y,
4354                        int32_t                  width,
4355                        int32_t                  height)
4356 {
4357     uint8_t     *dst_line, *dst;
4358     int dst_stride;
4359     uint32_t d;
4360     uint32_t src;
4361     int32_t w;
4362
4363     __m128i xmm_alpha;
4364     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4365
4366     PIXMAN_IMAGE_GET_LINE (
4367         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4368
4369     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4370
4371     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4372
4373     src = src >> 24;
4374
4375     if (src == 0xff)
4376         return;
4377
4378     if (src == 0x00)
4379     {
4380         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4381                      8, dest_x, dest_y, width, height, src);
4382
4383         return;
4384     }
4385
4386     while (height--)
4387     {
4388         dst = dst_line;
4389         dst_line += dst_stride;
4390         w = width;
4391
4392         while (w && ((unsigned long)dst & 15))
4393         {
4394             d = (uint32_t) *dst;
4395
4396             *dst++ = (uint8_t) pack_1x128_32 (
4397                 pix_multiply_1x128 (
4398                     xmm_alpha,
4399                     unpack_32_1x128 (d)));
4400             w--;
4401         }
4402
4403         while (w >= 16)
4404         {
4405             xmm_dst = load_128_aligned ((__m128i*)dst);
4406
4407             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4408
4409             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4410                                 &xmm_dst_lo, &xmm_dst_hi,
4411                                 &xmm_dst_lo, &xmm_dst_hi);
4412
4413             save_128_aligned (
4414                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4415
4416             dst += 16;
4417             w -= 16;
4418         }
4419
4420         while (w)
4421         {
4422             d = (uint32_t) *dst;
4423
4424             *dst++ = (uint8_t) pack_1x128_32 (
4425                 pix_multiply_1x128 (
4426                     xmm_alpha,
4427                     unpack_32_1x128 (d)));
4428             w--;
4429         }
4430     }
4431
4432 }
4433
4434 /* ---------------------------------------------------------------------------
4435  * composite_in_8_8
4436  */
4437
4438 static void
4439 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4440                        pixman_op_t              op,
4441                        pixman_image_t *         src_image,
4442                        pixman_image_t *         mask_image,
4443                        pixman_image_t *         dst_image,
4444                        int32_t                  src_x,
4445                        int32_t                  src_y,
4446                        int32_t                  mask_x,
4447                        int32_t                  mask_y,
4448                        int32_t                  dest_x,
4449                        int32_t                  dest_y,
4450                        int32_t                  width,
4451                        int32_t                  height)
4452 {
4453     uint8_t     *dst_line, *dst;
4454     uint8_t     *src_line, *src;
4455     int src_stride, dst_stride;
4456     int32_t w;
4457     uint32_t s, d;
4458
4459     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4460     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4461
4462     PIXMAN_IMAGE_GET_LINE (
4463         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4464     PIXMAN_IMAGE_GET_LINE (
4465         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4466
4467     while (height--)
4468     {
4469         dst = dst_line;
4470         dst_line += dst_stride;
4471         src = src_line;
4472         src_line += src_stride;
4473         w = width;
4474
4475         while (w && ((unsigned long)dst & 15))
4476         {
4477             s = (uint32_t) *src++;
4478             d = (uint32_t) *dst;
4479
4480             *dst++ = (uint8_t) pack_1x128_32 (
4481                 pix_multiply_1x128 (
4482                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
4483             w--;
4484         }
4485
4486         while (w >= 16)
4487         {
4488             xmm_src = load_128_unaligned ((__m128i*)src);
4489             xmm_dst = load_128_aligned ((__m128i*)dst);
4490
4491             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4492             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4493
4494             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4495                                 &xmm_dst_lo, &xmm_dst_hi,
4496                                 &xmm_dst_lo, &xmm_dst_hi);
4497
4498             save_128_aligned (
4499                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4500
4501             src += 16;
4502             dst += 16;
4503             w -= 16;
4504         }
4505
4506         while (w)
4507         {
4508             s = (uint32_t) *src++;
4509             d = (uint32_t) *dst;
4510
4511             *dst++ = (uint8_t) pack_1x128_32 (
4512                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4513             w--;
4514         }
4515     }
4516
4517 }
4518
4519 /* -------------------------------------------------------------------------
4520  * composite_add_n_8_8
4521  */
4522
4523 static void
4524 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4525                           pixman_op_t              op,
4526                           pixman_image_t *         src_image,
4527                           pixman_image_t *         mask_image,
4528                           pixman_image_t *         dst_image,
4529                           int32_t                  src_x,
4530                           int32_t                  src_y,
4531                           int32_t                  mask_x,
4532                           int32_t                  mask_y,
4533                           int32_t                  dest_x,
4534                           int32_t                  dest_y,
4535                           int32_t                  width,
4536                           int32_t                  height)
4537 {
4538     uint8_t     *dst_line, *dst;
4539     uint8_t     *mask_line, *mask;
4540     int dst_stride, mask_stride;
4541     int32_t w;
4542     uint32_t src;
4543     uint8_t sa;
4544     uint32_t m, d;
4545
4546     __m128i xmm_alpha;
4547     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4548     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4549
4550     PIXMAN_IMAGE_GET_LINE (
4551         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4552     PIXMAN_IMAGE_GET_LINE (
4553         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4554
4555     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4556
4557     sa = src >> 24;
4558
4559     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4560
4561     while (height--)
4562     {
4563         dst = dst_line;
4564         dst_line += dst_stride;
4565         mask = mask_line;
4566         mask_line += mask_stride;
4567         w = width;
4568
4569         while (w && ((unsigned long)dst & 15))
4570         {
4571             m = (uint32_t) *mask++;
4572             d = (uint32_t) *dst;
4573
4574             *dst++ = (uint8_t) pack_1x128_32 (
4575                 _mm_adds_epu16 (
4576                     pix_multiply_1x128 (
4577                         xmm_alpha, unpack_32_1x128 (m)),
4578                     unpack_32_1x128 (d)));
4579             w--;
4580         }
4581
4582         while (w >= 16)
4583         {
4584             xmm_mask = load_128_unaligned ((__m128i*)mask);
4585             xmm_dst = load_128_aligned ((__m128i*)dst);
4586
4587             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4588             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4589
4590             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4591                                 &xmm_mask_lo, &xmm_mask_hi,
4592                                 &xmm_mask_lo, &xmm_mask_hi);
4593
4594             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4595             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4596
4597             save_128_aligned (
4598                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4599
4600             mask += 16;
4601             dst += 16;
4602             w -= 16;
4603         }
4604
4605         while (w)
4606         {
4607             m = (uint32_t) *mask++;
4608             d = (uint32_t) *dst;
4609
4610             *dst++ = (uint8_t) pack_1x128_32 (
4611                 _mm_adds_epu16 (
4612                     pix_multiply_1x128 (
4613                         xmm_alpha, unpack_32_1x128 (m)),
4614                     unpack_32_1x128 (d)));
4615
4616             w--;
4617         }
4618     }
4619
4620 }
4621
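/* ADD must clamp rather than wrap, which SSE2 provides directly:
 * _mm_adds_epu16 and _mm_adds_epu8 saturate at 0xffff and 0xff.
 * Scalar sketch of the unsigned byte case (illustrative only):
 *
 *     static uint8_t adds_u8 (uint8_t a, uint8_t b)
 *     {
 *         unsigned t = (unsigned) a + b;
 *
 *         return (uint8_t) (t > 0xff ? 0xff : t);
 *     }
 *
 * Here the mask is multiplied into the solid source's alpha first, so
 * the per-pixel work is one multiply and one saturating add.
 */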
4622 /* -------------------------------------------------------------------------
4623  * composite_add_n_8
4624  */
4625
4626 static void
4627 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4628                         pixman_op_t              op,
4629                         pixman_image_t *         src_image,
4630                         pixman_image_t *         mask_image,
4631                         pixman_image_t *         dst_image,
4632                         int32_t                  src_x,
4633                         int32_t                  src_y,
4634                         int32_t                  mask_x,
4635                         int32_t                  mask_y,
4636                         int32_t                  dest_x,
4637                         int32_t                  dest_y,
4638                         int32_t                  width,
4639                         int32_t                  height)
4640 {
4641     uint8_t     *dst_line, *dst;
4642     int dst_stride;
4643     int32_t w;
4644     uint32_t src;
4645
4646     __m128i xmm_src;
4647
4648     PIXMAN_IMAGE_GET_LINE (
4649         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4650
4651     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4652
4653     src >>= 24;
4654
4655     if (src == 0x00)
4656         return;
4657
4658     if (src == 0xff)
4659     {
4660         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4661                      8, dest_x, dest_y, width, height, 0xff);
4662
4663         return;
4664     }
4665
4666     src = (src << 24) | (src << 16) | (src << 8) | src;
4667     xmm_src = _mm_set_epi32 (src, src, src, src);
4668
4669     while (height--)
4670     {
4671         dst = dst_line;
4672         dst_line += dst_stride;
4673         w = width;
4674
4675         while (w && ((unsigned long)dst & 15))
4676         {
4677             *dst = (uint8_t)_mm_cvtsi128_si32 (
4678                 _mm_adds_epu8 (
4679                     xmm_src,
4680                     _mm_cvtsi32_si128 (*dst)));
4681
4682             w--;
4683             dst++;
4684         }
4685
4686         while (w >= 16)
4687         {
4688             save_128_aligned (
4689                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4690
4691             dst += 16;
4692             w -= 16;
4693         }
4694
4695         while (w)
4696         {
4697             *dst = (uint8_t)_mm_cvtsi128_si32 (
4698                 _mm_adds_epu8 (
4699                     xmm_src,
4700                     _mm_cvtsi32_si128 (*dst)));
4701
4702             w--;
4703             dst++;
4704         }
4705     }
4706
4707 }
4708
4709 /* ----------------------------------------------------------------------
4710  * composite_add_8_8
4711  */
4712
4713 static void
4714 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4715                         pixman_op_t              op,
4716                         pixman_image_t *         src_image,
4717                         pixman_image_t *         mask_image,
4718                         pixman_image_t *         dst_image,
4719                         int32_t                  src_x,
4720                         int32_t                  src_y,
4721                         int32_t                  mask_x,
4722                         int32_t                  mask_y,
4723                         int32_t                  dest_x,
4724                         int32_t                  dest_y,
4725                         int32_t                  width,
4726                         int32_t                  height)
4727 {
4728     uint8_t     *dst_line, *dst;
4729     uint8_t     *src_line, *src;
4730     int dst_stride, src_stride;
4731     int32_t w;
4732     uint16_t t;
4733
4734     PIXMAN_IMAGE_GET_LINE (
4735         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4736     PIXMAN_IMAGE_GET_LINE (
4737         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4738
4739     while (height--)
4740     {
4741         dst = dst_line;
4742         src = src_line;
4743
4744         dst_line += dst_stride;
4745         src_line += src_stride;
4746         w = width;
4747
4748         /* Small head */
4749         while (w && (unsigned long)dst & 3)
4750         {
4751             t = (*dst) + (*src++);
4752             *dst++ = t | (0 - (t >> 8));
4753             w--;
4754         }
4755
4756         sse2_combine_add_u (imp, op,
4757                             (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4758
4759         /* Small tail */
4760         dst += w & ~3; /* not 0xfffc: that would truncate w >= 0x10000 */
4761         src += w & ~3;
4762
4763         w &= 3;
4764
4765         while (w)
4766         {
4767             t = (*dst) + (*src++);
4768             *dst++ = t | (0 - (t >> 8));
4769             w--;
4770         }
4771     }
4772
4773 }
4774
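/* The head and tail loops above saturate without branching: for the
 * 9-bit sum t, (t >> 8) is the carry bit, so (0 - (t >> 8)) is either
 * 0x00 or 0xff as a byte, and OR-ing it in clamps the result.  For
 * example 200 + 100 = 0x12c: the carry is 1, 0 - 1 gives all-ones,
 * and 0x2c | 0xff = 0xff.  The aligned middle of each scanline is
 * handed to sse2_combine_add_u, which processes four bytes per 32-bit
 * unit.
 */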
4775 /* ---------------------------------------------------------------------
4776  * composite_add_8888_8888
4777  */
4778 static void
4779 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4780                               pixman_op_t              op,
4781                               pixman_image_t *         src_image,
4782                               pixman_image_t *         mask_image,
4783                               pixman_image_t *         dst_image,
4784                               int32_t                  src_x,
4785                               int32_t                  src_y,
4786                               int32_t                  mask_x,
4787                               int32_t                  mask_y,
4788                               int32_t                  dest_x,
4789                               int32_t                  dest_y,
4790                               int32_t                  width,
4791                               int32_t                  height)
4792 {
4793     uint32_t    *dst_line, *dst;
4794     uint32_t    *src_line, *src;
4795     int dst_stride, src_stride;
4796
4797     PIXMAN_IMAGE_GET_LINE (
4798         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4799     PIXMAN_IMAGE_GET_LINE (
4800         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4801
4802     while (height--)
4803     {
4804         dst = dst_line;
4805         dst_line += dst_stride;
4806         src = src_line;
4807         src_line += src_stride;
4808
4809         sse2_combine_add_u (imp, op, dst, src, NULL, width);
4810     }
4811
4812 }
4813
4814 /* -------------------------------------------------------------------------------------------------
4815  * sse2_composite_copy_area
4816  */
4817
4818 static pixman_bool_t
4819 pixman_blt_sse2 (uint32_t *src_bits,
4820                  uint32_t *dst_bits,
4821                  int       src_stride,
4822                  int       dst_stride,
4823                  int       src_bpp,
4824                  int       dst_bpp,
4825                  int       src_x,
4826                  int       src_y,
4827                  int       dst_x,
4828                  int       dst_y,
4829                  int       width,
4830                  int       height)
4831 {
4832     uint8_t *   src_bytes;
4833     uint8_t *   dst_bytes;
4834     int byte_width;
4835
4836     if (src_bpp != dst_bpp)
4837         return FALSE;
4838
4839     if (src_bpp == 16)
4840     {
4841         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4842         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4843         src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4844         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4845         byte_width = 2 * width;
4846         src_stride *= 2;
4847         dst_stride *= 2;
4848     }
4849     else if (src_bpp == 32)
4850     {
4851         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4852         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4853         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4854         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4855         byte_width = 4 * width;
4856         src_stride *= 4;
4857         dst_stride *= 4;
4858     }
4859     else
4860     {
4861         return FALSE;
4862     }
4863
4864     while (height--)
4865     {
4866         int w;
4867         uint8_t *s = src_bytes;
4868         uint8_t *d = dst_bytes;
4869         src_bytes += src_stride;
4870         dst_bytes += dst_stride;
4871         w = byte_width;
4872
4873         while (w >= 2 && ((unsigned long)d & 3))
4874         {
4875             *(uint16_t *)d = *(uint16_t *)s;
4876             w -= 2;
4877             s += 2;
4878             d += 2;
4879         }
4880
4881         while (w >= 4 && ((unsigned long)d & 15))
4882         {
4883             *(uint32_t *)d = *(uint32_t *)s;
4884
4885             w -= 4;
4886             s += 4;
4887             d += 4;
4888         }
4889
4890         while (w >= 64)
4891         {
4892             __m128i xmm0, xmm1, xmm2, xmm3;
4893
4894             xmm0 = load_128_unaligned ((__m128i*)(s));
4895             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4896             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4897             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4898
4899             save_128_aligned ((__m128i*)(d),    xmm0);
4900             save_128_aligned ((__m128i*)(d + 16), xmm1);
4901             save_128_aligned ((__m128i*)(d + 32), xmm2);
4902             save_128_aligned ((__m128i*)(d + 48), xmm3);
4903
4904             s += 64;
4905             d += 64;
4906             w -= 64;
4907         }
4908
4909         while (w >= 16)
4910         {
4911             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4912
4913             w -= 16;
4914             d += 16;
4915             s += 16;
4916         }
4917
4918         while (w >= 4)
4919         {
4920             *(uint32_t *)d = *(uint32_t *)s;
4921
4922             w -= 4;
4923             s += 4;
4924             d += 4;
4925         }
4926
4927         if (w >= 2)
4928         {
4929             *(uint16_t *)d = *(uint16_t *)s;
4930             w -= 2;
4931             s += 2;
4932             d += 2;
4933         }
4934     }
4935
4937     return TRUE;
4938 }
4939
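/* Usage sketch for pixman_blt_sse2 (hypothetical, non-overlapping
 * 32 bpp buffers; strides in uint32_t units):
 *
 *     uint32_t src[64 * 64], dst[128 * 64];
 *
 *     pixman_blt_sse2 (src, dst, 64, 128, 32, 32,
 *                      0, 0, 16, 8, 32, 32);
 *
 * copies the 32x32 rectangle at (0, 0) of `src' to (16, 8) of `dst'.
 * The source is read with unaligned loads while the head loops align
 * the destination, so only the stores need 16-byte alignment.
 */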
4940 static void
4941 sse2_composite_copy_area (pixman_implementation_t *imp,
4942                           pixman_op_t              op,
4943                           pixman_image_t *         src_image,
4944                           pixman_image_t *         mask_image,
4945                           pixman_image_t *         dst_image,
4946                           int32_t                  src_x,
4947                           int32_t                  src_y,
4948                           int32_t                  mask_x,
4949                           int32_t                  mask_y,
4950                           int32_t                  dest_x,
4951                           int32_t                  dest_y,
4952                           int32_t                  width,
4953                           int32_t                  height)
4954 {
4955     pixman_blt_sse2 (src_image->bits.bits,
4956                      dst_image->bits.bits,
4957                      src_image->bits.rowstride,
4958                      dst_image->bits.rowstride,
4959                      PIXMAN_FORMAT_BPP (src_image->bits.format),
4960                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
4961                      src_x, src_y, dest_x, dest_y, width, height);
4962 }
4963
static void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint8_t         *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;
    __m128i ms;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (unsigned long)dst & 15)
        {
            s = 0xff000000 | *src++;
            m = (uint32_t) *mask++;
            d = *dst;
            ms = unpack_32_1x128 (s);

            if (m != 0xff)
            {
                __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
                __m128i md = unpack_32_1x128 (d);

                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
            }

            *dst++ = pack_1x128_32 (ms);
            w--;
        }

        while (w >= 4)
        {
            m = *(uint32_t*) mask;
            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);

            if (m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_src);
            }
            else
            {
                xmm_dst = load_128_aligned ((__m128i*)dst);

                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            m = (uint32_t) *mask++;

            if (m)
            {
                s = 0xff000000 | *src;

                if (m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ma, md, ms;

                    d = *dst;

                    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
                    md = unpack_32_1x128 (d);
                    ms = unpack_32_1x128 (s);

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
                }
            }

            src++;
            dst++;
            w--;
        }
    }
}

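/* Composite OVER an a8r8g8b8 source through an a8 mask onto an 8888
 * destination.  Unlike the x888 variant above, the source alpha is
 * real, so it takes part in the blend: roughly
 * *dst = s*m + d*(255 - sa*m) per channel, with each product scaled
 * back into 0..255 (sa is the source alpha; a sketch only).  The
 * scalar loops special-case m == 0 (skip) and sa == m == 0xff (plain
 * copy); the 4-pixel loop additionally stores the source directly
 * when the mask dword is 0xffffffff and all four source pixels are
 * opaque.
 */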
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint8_t         *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (unsigned long)dst & 15)
        {
            uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }

        while (w >= 4)
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                xmm_src = load_128_unaligned ((__m128i*)src);

                if (m == 0xffffffff && is_opaque (xmm_src))
                {
                    save_128_aligned ((__m128i *)dst, xmm_src);
                }
                else
                {
                    xmm_dst = load_128_aligned ((__m128i *)dst);

                    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                                   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
                }
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }
    }
}

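/* OVER_REVERSE with a solid source: the destination is composited
 * over the constant color, i.e. *dst = d + src * (255 - da) per
 * channel (da is the destination alpha).  The solid source is
 * expanded into an __m128i once, outside the loops.  Because
 * over_2x128 () writes its result into its third argument, the
 * expanded source is copied into tmp_lo/tmp_hi on every iteration so
 * the original survives for the next group of four pixels.
 */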
static void
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, *dst;
    __m128i xmm_src;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_dsta_hi, xmm_dsta_lo;
    int dst_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            __m128i vd;

            vd = unpack_32_1x128 (*dst);

            *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                              xmm_src));
            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m128i tmp_lo, tmp_hi;

            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);

            tmp_lo = xmm_src;
            tmp_hi = xmm_src;

            over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                        &xmm_dsta_lo, &xmm_dsta_hi,
                        &tmp_lo, &tmp_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            __m128i vd;

            vd = unpack_32_1x128 (*dst);

            *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                              xmm_src));
            w--;
            dst++;
        }
    }
}

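/* Composite OVER an a8r8g8b8 source with an a8r8g8b8 mask in unified
 * (non-component-alpha) mode: only the alpha byte of each mask pixel
 * matters.  That is why the scalar loops use m = (*mask++) >> 24, and
 * why the vector loop expands the loaded mask with expand_alpha_2x128
 * (alpha is already the top byte) rather than expand_alpha_rev_2x128.
 */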
static void
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint32_t    *mask, *mask_line;
    uint32_t    m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (unsigned long)dst & 15)
        {
            uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);

            if (!is_transparent (xmm_mask))
            {
                xmm_src = load_128_unaligned ((__m128i*)src);

                if (is_opaque (xmm_mask) && is_opaque (xmm_src))
                {
                    save_128_aligned ((__m128i *)dst, xmm_src);
                }
                else
                {
                    xmm_dst = load_128_aligned ((__m128i *)dst);

                    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                                   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
                }
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }
    }
}

/* A variant of 'sse2_combine_over_u' with minor tweaks */
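/* The tweak: instead of walking the source linearly, source pixels are
 * gathered through the 16.16 fixed-point coordinate vx, which advances
 * by unit_x per destination pixel (vx >> 16 is the integer source
 * offset).  Four gathered pixels are packed with _mm_set_epi32 and then
 * combined exactly as in the straight OVER combiner.  pm stays NULL
 * here, so combine1/combine4 reduce to plain loads.
 */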
static force_inline void
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                             const uint32_t* ps,
                                             int32_t         w,
                                             pixman_fixed_t  vx,
                                             pixman_fixed_t  unit_x,
                                             pixman_fixed_t  max_vx,
                                             pixman_bool_t   fully_transparent_src)
{
    uint32_t s, d;
    const uint32_t* pm = NULL;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (fully_transparent_src)
        return;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps + (vx >> 16), pm);
        vx += unit_x;

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        if (pm)
            pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i tmp;
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = ps[vx >> 16];
        vx += unit_x;
        tmp2 = ps[vx >> 16];
        vx += unit_x;
        tmp3 = ps[vx >> 16];
        vx += unit_x;
        tmp4 = ps[vx >> 16];
        vx += unit_x;

        tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

        xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);

        if (is_opaque (xmm_src_hi))
        {
            save_128_aligned ((__m128i*)pd, xmm_src_hi);
        }
        else if (!is_zero (xmm_src_hi))
        {
            xmm_dst_hi = load_128_aligned ((__m128i*) pd);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

            expand_alpha_2x128 (
                xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned ((__m128i*)pd,
                              pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        w -= 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps + (vx >> 16), pm);
        vx += unit_x;

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        if (pm)
            pm++;

        w--;
    }
}

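/* FAST_NEAREST_MAINLOOP (from pixman-fast-path.h) wraps the scanline
 * worker above into complete composite functions, one per repeat mode
 * (COVER, NONE, PAD).  The generated sse2_8888_8888_*_OVER entry
 * points are what the SIMPLE_NEAREST_FAST_PATH entries in the fast
 * path table further down refer to.
 */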
FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)

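/* Nearest-scaled OVER with a solid mask: the constant mask alpha
 * (*mask >> 24) is expanded once into xmm_mask, and every gathered
 * source pixel then goes through in_over against that constant.  A
 * zero mask, or a known fully transparent source, makes the whole
 * scanline a no-op.
 */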
static force_inline void
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
                                               uint32_t *       dst,
                                               const uint32_t * src,
                                               int32_t          w,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
                                               pixman_fixed_t   max_vx,
                                               pixman_bool_t    zero_src)
{
    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (zero_src || (*mask >> 24) == 0)
        return;

    xmm_mask = create_mask_16_128 (*mask >> 24);

    while (w && (unsigned long)dst & 15)
    {
        uint32_t s = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        if (s)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (s);
            __m128i alpha     = expand_alpha_1x128 (ms);
            __m128i dest      = xmm_mask;
            __m128i alpha_dst = unpack_32_1x128 (d);

            *dst = pack_1x128_32 (
                in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
        }
        dst++;
        w--;
    }

    while (w >= 4)
    {
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp2 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp3 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp4 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

        if (!is_zero (xmm_src))
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                           &xmm_alpha_lo, &xmm_alpha_hi,
                           &xmm_mask, &xmm_mask,
                           &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        dst += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        if (s)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (s);
            __m128i alpha = expand_alpha_1x128 (ms);
            __m128i mask  = xmm_mask;
            __m128i dest  = unpack_32_1x128 (d);

            *dst = pack_1x128_32 (
                in_over_1x128 (&ms, &alpha, &mask, &dest));
        }

        dst++;
        w--;
    }
}

FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)

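/* The fast path table.  Each PIXMAN_STD_FAST_PATH entry maps an
 * (operator, source format, mask format, destination format) tuple to
 * one of the specialized composite functions in this file; "solid"
 * stands for a repeating 1x1 source and "null" for no mask.  The
 * table is searched top to bottom and the first match wins, so more
 * specific entries must come before more general ones; PIXMAN_OP_NONE
 * terminates it.
 */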
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),

    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),

    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),

    { PIXMAN_OP_NONE },
};

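/* The blt and fill entry points follow the delegation pattern: try
 * the SSE2 routine first and, if it rejects the request (for example
 * an unsupported bpp), hand the call unchanged to the delegate
 * implementation further down the chain.
 */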
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,
          int                      src_stride,
          int                      dst_stride,
          int                      src_bpp,
          int                      dst_bpp,
          int                      src_x,
          int                      src_y,
          int                      dst_x,
          int                      dst_y,
          int                      width,
          int                      height)
{
    if (!pixman_blt_sse2 (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height))
    {
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height);
    }

    return TRUE;
}

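/* On 32-bit x86 the ABI only guarantees 4-byte stack alignment, while
 * spilled __m128i values need 16 bytes; __force_align_arg_pointer__
 * makes GCC realign the stack on entry so the aligned spill slots in
 * this function stay valid regardless of the caller.
 */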
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
           int                      stride,
           int                      bpp,
           int                      x,
           int                      y,
           int                      width,
           int                      height,
           uint32_t xor)
{
    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}

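/* Scanline fetchers.  Each one converts a row of the source image
 * into the canonical a8r8g8b8 intermediate format that the general
 * compositing code consumes, writing into iter->buffer.  For
 * x8r8g8b8 the conversion is just forcing the alpha byte to 0xff.
 */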
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((unsigned long)dst) & 0x0f)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    while (w >= 4)
    {
        save_128_aligned (
            (__m128i *)dst, _mm_or_si128 (
                load_128_unaligned ((__m128i *)src), ff000000));

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    return iter->buffer;
}

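/* r5g6b5 rows are widened eight pixels at a time: unpack_565_to_8888
 * shifts each field into place and replicates its top bits into the
 * newly opened low bits, so full-scale 565 values (0x1f, 0x3f) expand
 * to full-scale 0xff channels, matching the scalar
 * CONVERT_0565_TO_8888 used for the head and tail pixels.
 */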
static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;

    iter->bits += iter->stride;

    while (w && ((unsigned long)dst) & 0x0f)
    {
        uint16_t s = *src++;

        *dst++ = CONVERT_0565_TO_8888 (s);
        w--;
    }

    while (w >= 8)
    {
        __m128i lo, hi, s;

        s = _mm_loadu_si128 ((__m128i *)src);

        lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
        hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));

        save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
        save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        uint16_t s = *src++;

        *dst++ = CONVERT_0565_TO_8888 (s);
        w--;
    }

    return iter->buffer;
}

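/* a8 rows become a8r8g8b8 pixels with zero color channels.
 * Interleaving zeros below each alpha byte twice pushes the alpha
 * into the top byte of its 32-bit lane, which is exactly *src << 24
 * without any explicit shift:
 *
 *     [a0 a1 ...] -> [00 a0 00 a1 ...] -> [00 00 00 a0 ...]
 *
 * (byte order as in memory, little endian, so a0 ends up topmost).
 * Sixteen pixels are expanded per iteration.
 */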
static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

    iter->bits += iter->stride;

    while (w && (((unsigned long)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    while (w >= 16)
    {
        xmm0 = _mm_loadu_si128((__m128i *)src);

        xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
        xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
        xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
        xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
        xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
        xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);

        _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
        _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
        _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
        _mm_store_si128(((__m128i *)(dst + 12)), xmm6);

        dst += 16;
        src += 16;
        w -= 16;
    }

    while (w)
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    return iter->buffer;
}

typedef struct
{
    pixman_format_code_t        format;
    pixman_iter_get_scanline_t  get_scanline;
} fetcher_info_t;

static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
    { PIXMAN_a8,                sse2_fetch_a8 },
    { PIXMAN_null }
};

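/* Install one of the fetchers above when the iteration wants the
 * narrow (32-bit a8r8g8b8) intermediate format, the image is a plain
 * untransformed bits image (FAST_PATH_STANDARD_FLAGS |
 * FAST_PATH_ID_TRANSFORM) and the requested rectangle lies entirely
 * inside it; anything else is delegated.
 */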
static void
sse2_src_iter_init (pixman_implementation_t *imp,
                    pixman_iter_t *iter,
                    pixman_image_t *image,
                    int x, int y, int width, int height,
                    uint8_t *buffer, iter_flags_t flags)
{
#define FLAGS                                                           \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)

    if ((flags & ITER_NARROW)                           &&
        (image->common.flags & FLAGS) == FLAGS          &&
        x >= 0 && y >= 0                                &&
        x + width <= image->bits.width                  &&
        y + height <= image->bits.height)
    {
        const fetcher_info_t *f;

        for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
        {
            if (image->common.extended_format_code == f->format)
            {
                uint8_t *b = (uint8_t *)image->bits.bits;
                int s = image->bits.rowstride * 4;

                iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
                iter->stride = s;
                iter->width = width;
                iter->buffer = (uint32_t *)buffer;

                iter->get_scanline = f->get_scanline;
                return;
            }
        }
    }

    _pixman_implementation_src_iter_init (
        imp->delegate, iter, image, x, y, width, height, buffer, flags);
}

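/* Build the SSE2 implementation: register the fast path table,
 * initialize the __m128i constants used throughout this file, and
 * hook up the combiners, blt, fill and the source iterator.  The
 * combine_32 slots now point straight at the sse2_combine_*
 * functions; the core_combine_* wrappers that used to sit in between
 * were removed.
 */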
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

    /* SSE2 constants */
    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);

    /* Set up function pointers */

    /* SSE code patch for fbcompose.c */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->src_iter_init = sse2_src_iter_init;

    return imp;
}

#endif /* USE_SSE2 */