pixman/pixman-mmx.c

   1 /*
   2  * Copyright © 2004, 2005 Red Hat, Inc.
   3  * Copyright © 2004 Nicholas Miell
   4  * Copyright © 2005 Trolltech AS
   5  *
   6  * Permission to use, copy, modify, distribute, and sell this software and its
   7  * documentation for any purpose is hereby granted without fee, provided that
   8  * the above copyright notice appear in all copies and that both that
   9  * copyright notice and this permission notice appear in supporting
  10  * documentation, and that the name of Red Hat not be used in advertising or
  11  * publicity pertaining to distribution of the software without specific,
  12  * written prior permission.  Red Hat makes no representations about the
  13  * suitability of this software for any purpose.  It is provided "as is"
  14  * without express or implied warranty.
  15  *
  16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
  17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
  19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
  21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  23  * SOFTWARE.
  24  *
  25  * Author:  Søren Sandmann (sandmann@redhat.com)
  26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
  27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
  28  *
  29  * Based on work by Owen Taylor
  30  */
  31
  32 #ifdef HAVE_CONFIG_H
  33 #include <config.h>
  34 #endif
  35
  36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT
  37
  38 #include <mmintrin.h>
  39 #include "pixman-private.h"
  40 #include "pixman-combine32.h"
  41
/* Debug tracing switch.  When VERBOSE is defined, CHECKPOINT() reports the
 * current function name and line number through error_f(); otherwise it
 * expands to nothing.  The macro defined right here is spelled no_vERBOSE,
 * so this line does not itself turn tracing on.
 */
#define no_vERBOSE

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif
  49
#ifdef USE_ARM_IWMMXT
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
/* The stand-in below has an empty body; it only exists so that code in this
 * file can call _mm_empty () unconditionally on both targets.
 */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}
#endif
  58
/* x86 only: make _mm_mulhi_pu16 () and _mm_shuffle_pi16 () available.
 * Sun Studio and MSVC take them from <xmmintrin.h>; every other compiler
 * gets the local inline-asm (or builtin) replacements defined below.
 */
#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER))
#  include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use.  */

/* Emits pmulhuw with __A as the read/write operand and __B as the source,
 * both constrained to MMX registers ("y"); returns the updated __A.
 */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
        : "+y" (__A)
        : "y" (__B)
    );
    return __A;
}

/* Emits pshufw on __A with selector __N.  The "K" constraint requires __N
 * to be a compile-time constant, so the inline-function form is only used
 * when __OPTIMIZE__ is defined; otherwise the compiler builtin is used
 * through a macro.  NOTE(review): presumably because the constant is only
 * visible to the asm after inlining -- confirm.
 */
#  ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
{
    __m64 ret;

    asm ("pshufw %2, %1, %0\n\t"
        : "=y" (ret)
        : "y" (__A), "K" (__N)
    );

    return ret;
}
#  else
#   define _mm_shuffle_pi16(A, N) \
    ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))
#  endif
# endif
#endif
  95
/* Build the 8-bit selector for _mm_shuffle_pi16 (): four 2-bit lane
 * indices, fp3 in the top two bits down to fp0 in the bottom two.
 * Not defined for MSVC.  NOTE(review): presumably MSVC gets it from
 * <xmmintrin.h>, included above -- confirm.
 */
#ifndef _MSC_VER
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif
 100
 101 /* Notes about writing mmx code
 102  *
 * Give memory operands as the second operand. If you give one as the
 * first, gcc will first load it into a register, then use that
 * register.
 *
 *   i.e. use
 108  *
 109  *         _mm_mullo_pi16 (x, mmx_constant);
 110  *
 111  *   not
 112  *
 113  *         _mm_mullo_pi16 (mmx_constant, x);
 114  *
 115  * Also try to minimize dependencies. i.e. when you need a value, try
 116  * to calculate it from a value that was calculated as early as
 117  * possible.
 118  */
 119
 120 /* --------------- MMX primitives ------------------------------------- */
 121
 122 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 123  * the name of the member used to access the data.
 124  * If __m64 requires using mm_cvt* intrinsics functions to convert between
 125  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 126  * If __m64 and uint64_t values can just be cast to each other directly,
 127  * then define USE_M64_CASTS.
 128  */
/* Pick exactly one of M64_MEMBER / USE_CVT_INTRINSICS / USE_M64_CASTS
 * according to the compiler in use.
 */
#ifdef _MSC_VER
/* MSVC: __m64 data is reached through the m64_u64 member. */
# define M64_MEMBER m64_u64
#elif defined(__ICC)
/* Intel compiler: convert with the mm_cvt* intrinsics. */
# define USE_CVT_INTRINSICS
#elif defined(__GNUC__)
/* GCC-compatible compilers: plain casts between uint64_t and __m64. */
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif
 149
/* Storage type for the entries of the constant table: a plain uint64_t
 * when values are converted on use (casts or cvt intrinsics), otherwise
 * __m64 itself.
 */
#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif
 155
/* Layout of the 64-bit constant table used by the helpers in this file.
 * Fields are read through the MC () macro, which prepends "mmx_".
 */
typedef struct
{
    mmxdatafield mmx_4x00ff;                /* XOR mask used by negate () */
    mmxdatafield mmx_4x0080;                /* bias added in pix_multiply () */
    mmxdatafield mmx_565_rgb;               /* channel mask used by expand565 () */
    mmxdatafield mmx_565_unpack_multiplier; /* per-lane multiplier used by expand565 () */
    mmxdatafield mmx_565_r;                 /* red mask used by pack_565 () */
    mmxdatafield mmx_565_g;                 /* green mask used by pack_565 () */
    mmxdatafield mmx_565_b;                 /* blue mask used by pack_565 () */
    mmxdatafield mmx_mask_0;                /* pack_565 (): clears target lane 0 */
    mmxdatafield mmx_mask_1;                /* pack_565 (): clears target lane 1 */
    mmxdatafield mmx_mask_2;                /* pack_565 (): clears target lane 2 */
    mmxdatafield mmx_mask_3;                /* pack_565 (): clears target lane 3 */
    mmxdatafield mmx_full_alpha;            /* ORed in by expandx888 () and over_rev_non_pre () */
    mmxdatafield mmx_4x0101;                /* multiplier used by pix_multiply () */
} mmx_data_t;
 172
/* Produce one initializer for an mmx_data_t field from a bare hex literal.
 * MSVC: positional, brace-wrapped, UI64 suffix (the field name is ignored).
 * Other struct-typed __m64: designated, brace-wrapped, ULL suffix.
 * Integral mmxdatafield: designated, ULL suffix.
 */
#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field =   { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field =   val ## ULL
#endif
 180
/* The constant table itself.  Entries must stay in the same order as the
 * fields of mmx_data_t, because the MSVC form of MMXDATA_INIT ignores the
 * field name and initializes positionally.
 */
static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
};
 197
/* MC (name) yields the table entry c.mmx_<name> as an __m64: converted
 * through to_m64 (), cast directly, or used as-is when the table already
 * stores __m64 values.
 */
#ifdef USE_CVT_INTRINSICS
#    define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
#    define MC(x) ((__m64)c.mmx_ ## x)
#else
#    define MC(x) c.mmx_ ## x
#endif
 205
/* Convert a raw 64-bit integer into an __m64 using whichever mechanism was
 * selected for this compiler: the conversion intrinsic, a store into the
 * struct member named by M64_MEMBER, or a plain cast.
 */
static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}
 220
 221 static force_inline uint64_t
 222 to_uint64 (__m64 x)
 223 {
 224 #ifdef USE_CVT_INTRINSICS
 225     return _mm_cvtm64_si64 (x);
 226 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
 227     uint64_t res = x.M64_MEMBER;
 228     return res;
 229 #else /* USE_M64_CASTS */
 230     return (uint64_t)x;
 231 #endif
 232 }
 233
 234 static force_inline __m64
 235 shift (__m64 v,
 236        int   s)
 237 {
 238     if (s > 0)
 239         return _mm_slli_si64 (v, s);
 240     else if (s < 0)
 241         return _mm_srli_si64 (v, -s);
 242     else
 243         return v;
 244 }
 245
 246 static force_inline __m64
 247 negate (__m64 mask)
 248 {
 249     return _mm_xor_si64 (mask, MC (4x00ff));
 250 }
 251
 252 static force_inline __m64
 253 pix_multiply (__m64 a, __m64 b)
 254 {
 255     __m64 res;
 256
 257     res = _mm_mullo_pi16 (a, b);
 258     res = _mm_adds_pu16 (res, MC (4x0080));
 259     res = _mm_mulhi_pu16 (res, MC (4x0101));
 260
 261     return res;
 262 }
 263
 264 static force_inline __m64
 265 pix_add (__m64 a, __m64 b)
 266 {
 267     return _mm_adds_pu8 (a, b);
 268 }
 269
 270 static force_inline __m64
 271 expand_alpha (__m64 pixel)
 272 {
 273     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
 274 }
 275
 276 static force_inline __m64
 277 expand_alpha_rev (__m64 pixel)
 278 {
 279     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
 280 }
 281
 282 static force_inline __m64
 283 invert_colors (__m64 pixel)
 284 {
 285     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
 286 }
 287
 288 static force_inline __m64
 289 over (__m64 src,
 290       __m64 srca,
 291       __m64 dest)
 292 {
 293     return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
 294 }
 295
 296 static force_inline __m64
 297 over_rev_non_pre (__m64 src, __m64 dest)
 298 {
 299     __m64 srca = expand_alpha (src);
 300     __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
 301
 302     return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
 303 }
 304
 305 static force_inline __m64
 306 in (__m64 src, __m64 mask)
 307 {
 308     return pix_multiply (src, mask);
 309 }
 310
 311 #ifndef _MSC_VER
 312 static force_inline __m64
 313 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
 314 {
 315     return over (in (src, mask), pix_multiply (srca, mask), dest);
 316 }
 317
 318 #else
 319
 320 #define in_over(src, srca, mask, dest)                                  \
 321     over (in (src, mask), pix_multiply (srca, mask), dest)
 322
 323 #endif
 324
 325 /* Elemental unaligned loads */
 326
 327 static force_inline __m64 ldq_u(uint64_t *p)
 328 {
 329 #ifdef USE_X86_MMX
 330     /* x86's alignment restrictions are very relaxed. */
 331     return *(__m64 *)p;
 332 #elif defined USE_ARM_IWMMXT
 333     int align = (uintptr_t)p & 7;
 334     __m64 *aligned_p;
 335     if (align == 0)
 336         return *p;
 337     aligned_p = (__m64 *)((uintptr_t)p & ~7);
 338     return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
 339 #else
 340     struct __una_u64 { uint64_t x __attribute__((packed)); };
 341     const struct __una_u64 *ptr = (const struct __una_u64 *) p;
 342     return (__m64) ptr->x;
 343 #endif
 344 }
 345
 346 static force_inline uint32_t ldl_u(const uint32_t *p)
 347 {
 348 #ifdef USE_X86_MMX
 349     /* x86's alignment restrictions are very relaxed. */
 350     return *p;
 351 #else
 352     struct __una_u32 { uint32_t x __attribute__((packed)); };
 353     const struct __una_u32 *ptr = (const struct __una_u32 *) p;
 354     return ptr->x;
 355 #endif
 356 }
 357
 358 static force_inline __m64
 359 load8888 (const uint32_t *v)
 360 {
 361     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (*v), _mm_setzero_si64 ());
 362 }
 363
 364 static force_inline __m64
 365 load8888u (const uint32_t *v)
 366 {
 367     uint32_t l = ldl_u(v);
 368     return load8888(&l);
 369 }
 370
 371 static force_inline __m64
 372 pack8888 (__m64 lo, __m64 hi)
 373 {
 374     return _mm_packs_pu16 (lo, hi);
 375 }
 376
 377 static force_inline void
 378 store (uint32_t *dest, __m64 v)
 379 {
 380     *dest = _mm_cvtsi64_si32 (v);
 381 }
 382
 383 static force_inline void
 384 store8888 (uint32_t *dest, __m64 v)
 385 {
 386     v = pack8888 (v, _mm_setzero_si64());
 387     store (dest, v);
 388 }
 389
 390 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 391  *
 392  *    00RR00GG00BB
 393  *
 394  * --- Expanding 565 in the low word ---
 395  *
 * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
 397  * m = m & (01f0003f001f);
 398  * m = m * (008404100840);
 399  * m = m >> 8;
 400  *
 401  * Note the trick here - the top word is shifted by another nibble to
 402  * avoid it bumping into the middle word
 403  */
 404 static force_inline __m64
 405 expand565 (__m64 pixel, int pos)
 406 {
 407     __m64 p = pixel;
 408     __m64 t1, t2;
 409
 410     /* move pixel to low 16 bit and zero the rest */
 411     p = shift (shift (p, (3 - pos) * 16), -48);
 412
 413     t1 = shift (p, 36 - 11);
 414     t2 = shift (p, 16 - 5);
 415
 416     p = _mm_or_si64 (t1, p);
 417     p = _mm_or_si64 (t2, p);
 418     p = _mm_and_si64 (p, MC (565_rgb));
 419
 420     pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
 421     return _mm_srli_pi16 (pixel, 8);
 422 }
 423
 424 static force_inline __m64
 425 expand8888 (__m64 in, int pos)
 426 {
 427     if (pos == 0)
 428         return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
 429     else
 430         return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
 431 }
 432
 433 static force_inline __m64
 434 expandx888 (__m64 in, int pos)
 435 {
 436     return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
 437 }
 438
 439 static force_inline __m64
 440 pack_565 (__m64 pixel, __m64 target, int pos)
 441 {
 442     __m64 p = pixel;
 443     __m64 t = target;
 444     __m64 r, g, b;
 445
 446     r = _mm_and_si64 (p, MC (565_r));
 447     g = _mm_and_si64 (p, MC (565_g));
 448     b = _mm_and_si64 (p, MC (565_b));
 449
 450     r = shift (r, -(32 - 8) + pos * 16);
 451     g = shift (g, -(16 - 3) + pos * 16);
 452     b = shift (b, -(0  + 3) + pos * 16);
 453
 454     if (pos == 0)
 455         t = _mm_and_si64 (t, MC (mask_0));
 456     else if (pos == 1)
 457         t = _mm_and_si64 (t, MC (mask_1));
 458     else if (pos == 2)
 459         t = _mm_and_si64 (t, MC (mask_2));
 460     else if (pos == 3)
 461         t = _mm_and_si64 (t, MC (mask_3));
 462
 463     p = _mm_or_si64 (r, t);
 464     p = _mm_or_si64 (g, p);
 465
 466     return _mm_or_si64 (b, p);
 467 }
 468
 469 #ifndef _MSC_VER
 470
 471 static force_inline __m64
 472 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
 473 {
 474     x = pix_multiply (x, a);
 475     y = pix_multiply (y, b);
 476
 477     return pix_add (x, y);
 478 }
 479
 480 #else
 481
 482 #define pix_add_mul(x, a, y, b)  \
 483     ( x = pix_multiply (x, a),   \
 484       y = pix_multiply (y, b),   \
 485       pix_add (x, y) )
 486
 487 #endif
 488
/* --------------- MMX code paths for fbcompose.c --------------------- */
 490
 491 static force_inline uint32_t
 492 combine (const uint32_t *src, const uint32_t *mask)
 493 {
 494     uint32_t ssrc = *src;
 495
 496     if (mask)
 497     {
 498         __m64 m = load8888 (mask);
 499         __m64 s = load8888 (&ssrc);
 500
 501         m = expand_alpha (m);
 502         s = pix_multiply (s, m);
 503
 504         store8888 (&ssrc, s);
 505     }
 506
 507     return ssrc;
 508 }
 509
 510 static void
 511 mmx_combine_over_u (pixman_implementation_t *imp,
 512                     pixman_op_t              op,
 513                     uint32_t *               dest,
 514                     const uint32_t *         src,
 515                     const uint32_t *         mask,
 516                     int                      width)
 517 {
 518     const uint32_t *end = dest + width;
 519
 520     while (dest < end)
 521     {
 522         uint32_t ssrc = combine (src, mask);
 523         uint32_t a = ssrc >> 24;
 524
 525         if (a == 0xff)
 526         {
 527             *dest = ssrc;
 528         }
 529         else if (ssrc)
 530         {
 531             __m64 s, sa;
 532             s = load8888 (&ssrc);
 533             sa = expand_alpha (s);
 534             store8888 (dest, over (s, sa, load8888 (dest)));
 535         }
 536
 537         ++dest;
 538         ++src;
 539         if (mask)
 540             ++mask;
 541     }
 542     _mm_empty ();
 543 }
 544
 545 static void
 546 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
 547                             pixman_op_t              op,
 548                             uint32_t *               dest,
 549                             const uint32_t *         src,
 550                             const uint32_t *         mask,
 551                             int                      width)
 552 {
 553     const uint32_t *end = dest + width;
 554
 555     while (dest < end)
 556     {
 557         __m64 d, da;
 558         uint32_t s = combine (src, mask);
 559
 560         d = load8888 (dest);
 561         da = expand_alpha (d);
 562         store8888 (dest, over (d, da, load8888 (&s)));
 563
 564         ++dest;
 565         ++src;
 566         if (mask)
 567             mask++;
 568     }
 569     _mm_empty ();
 570 }
 571
 572 static void
 573 mmx_combine_in_u (pixman_implementation_t *imp,
 574                   pixman_op_t              op,
 575                   uint32_t *               dest,
 576                   const uint32_t *         src,
 577                   const uint32_t *         mask,
 578                   int                      width)
 579 {
 580     const uint32_t *end = dest + width;
 581
 582     while (dest < end)
 583     {
 584         __m64 x, a;
 585         uint32_t ssrc = combine (src, mask);
 586
 587         x = load8888 (&ssrc);
 588         a = load8888 (dest);
 589         a = expand_alpha (a);
 590         x = pix_multiply (x, a);
 591
 592         store8888 (dest, x);
 593
 594         ++dest;
 595         ++src;
 596         if (mask)
 597             mask++;
 598     }
 599     _mm_empty ();
 600 }
 601
 602 static void
 603 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
 604                           pixman_op_t              op,
 605                           uint32_t *               dest,
 606                           const uint32_t *         src,
 607                           const uint32_t *         mask,
 608                           int                      width)
 609 {
 610     const uint32_t *end = dest + width;
 611
 612     while (dest < end)
 613     {
 614         __m64 x, a;
 615         uint32_t ssrc = combine (src, mask);
 616
 617         x = load8888 (dest);
 618         a = load8888 (&ssrc);
 619         a = expand_alpha (a);
 620         x = pix_multiply (x, a);
 621         store8888 (dest, x);
 622
 623         ++dest;
 624         ++src;
 625         if (mask)
 626             mask++;
 627     }
 628     _mm_empty ();
 629 }
 630
 631 static void
 632 mmx_combine_out_u (pixman_implementation_t *imp,
 633                    pixman_op_t              op,
 634                    uint32_t *               dest,
 635                    const uint32_t *         src,
 636                    const uint32_t *         mask,
 637                    int                      width)
 638 {
 639     const uint32_t *end = dest + width;
 640
 641     while (dest < end)
 642     {
 643         __m64 x, a;
 644         uint32_t ssrc = combine (src, mask);
 645
 646         x = load8888 (&ssrc);
 647         a = load8888 (dest);
 648         a = expand_alpha (a);
 649         a = negate (a);
 650         x = pix_multiply (x, a);
 651         store8888 (dest, x);
 652
 653         ++dest;
 654         ++src;
 655         if (mask)
 656             mask++;
 657     }
 658     _mm_empty ();
 659 }
 660
 661 static void
 662 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
 663                            pixman_op_t              op,
 664                            uint32_t *               dest,
 665                            const uint32_t *         src,
 666                            const uint32_t *         mask,
 667                            int                      width)
 668 {
 669     const uint32_t *end = dest + width;
 670
 671     while (dest < end)
 672     {
 673         __m64 x, a;
 674         uint32_t ssrc = combine (src, mask);
 675
 676         x = load8888 (dest);
 677         a = load8888 (&ssrc);
 678         a = expand_alpha (a);
 679         a = negate (a);
 680         x = pix_multiply (x, a);
 681
 682         store8888 (dest, x);
 683
 684         ++dest;
 685         ++src;
 686         if (mask)
 687             mask++;
 688     }
 689     _mm_empty ();
 690 }
 691
 692 static void
 693 mmx_combine_atop_u (pixman_implementation_t *imp,
 694                     pixman_op_t              op,
 695                     uint32_t *               dest,
 696                     const uint32_t *         src,
 697                     const uint32_t *         mask,
 698                     int                      width)
 699 {
 700     const uint32_t *end = dest + width;
 701
 702     while (dest < end)
 703     {
 704         __m64 s, da, d, sia;
 705         uint32_t ssrc = combine (src, mask);
 706
 707         s = load8888 (&ssrc);
 708         d = load8888 (dest);
 709         sia = expand_alpha (s);
 710         sia = negate (sia);
 711         da = expand_alpha (d);
 712         s = pix_add_mul (s, da, d, sia);
 713         store8888 (dest, s);
 714
 715         ++dest;
 716         ++src;
 717         if (mask)
 718             mask++;
 719     }
 720     _mm_empty ();
 721 }
 722
 723 static void
 724 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
 725                             pixman_op_t              op,
 726                             uint32_t *               dest,
 727                             const uint32_t *         src,
 728                             const uint32_t *         mask,
 729                             int                      width)
 730 {
 731     const uint32_t *end;
 732
 733     end = dest + width;
 734
 735     while (dest < end)
 736     {
 737         __m64 s, dia, d, sa;
 738         uint32_t ssrc = combine (src, mask);
 739
 740         s = load8888 (&ssrc);
 741         d = load8888 (dest);
 742         sa = expand_alpha (s);
 743         dia = expand_alpha (d);
 744         dia = negate (dia);
 745         s = pix_add_mul (s, dia, d, sa);
 746         store8888 (dest, s);
 747
 748         ++dest;
 749         ++src;
 750         if (mask)
 751             mask++;
 752     }
 753     _mm_empty ();
 754 }
 755
 756 static void
 757 mmx_combine_xor_u (pixman_implementation_t *imp,
 758                    pixman_op_t              op,
 759                    uint32_t *               dest,
 760                    const uint32_t *         src,
 761                    const uint32_t *         mask,
 762                    int                      width)
 763 {
 764     const uint32_t *end = dest + width;
 765
 766     while (dest < end)
 767     {
 768         __m64 s, dia, d, sia;
 769         uint32_t ssrc = combine (src, mask);
 770
 771         s = load8888 (&ssrc);
 772         d = load8888 (dest);
 773         sia = expand_alpha (s);
 774         dia = expand_alpha (d);
 775         sia = negate (sia);
 776         dia = negate (dia);
 777         s = pix_add_mul (s, dia, d, sia);
 778         store8888 (dest, s);
 779
 780         ++dest;
 781         ++src;
 782         if (mask)
 783             mask++;
 784     }
 785     _mm_empty ();
 786 }
 787
 788 static void
 789 mmx_combine_add_u (pixman_implementation_t *imp,
 790                    pixman_op_t              op,
 791                    uint32_t *               dest,
 792                    const uint32_t *         src,
 793                    const uint32_t *         mask,
 794                    int                      width)
 795 {
 796     const uint32_t *end = dest + width;
 797
 798     while (dest < end)
 799     {
 800         __m64 s, d;
 801         uint32_t ssrc = combine (src, mask);
 802
 803         s = load8888 (&ssrc);
 804         d = load8888 (dest);
 805         s = pix_add (s, d);
 806         store8888 (dest, s);
 807
 808         ++dest;
 809         ++src;
 810         if (mask)
 811             mask++;
 812     }
 813     _mm_empty ();
 814 }
 815
 816 static void
 817 mmx_combine_saturate_u (pixman_implementation_t *imp,
 818                         pixman_op_t              op,
 819                         uint32_t *               dest,
 820                         const uint32_t *         src,
 821                         const uint32_t *         mask,
 822                         int                      width)
 823 {
 824     const uint32_t *end = dest + width;
 825
 826     while (dest < end)
 827     {
 828         uint32_t s = combine (src, mask);
 829         uint32_t d = *dest;
 830         __m64 ms = load8888 (&s);
 831         __m64 md = load8888 (&d);
 832         uint32_t sa = s >> 24;
 833         uint32_t da = ~d >> 24;
 834
 835         if (sa > da)
 836         {
 837             uint32_t quot = DIV_UN8 (da, sa) << 24;
 838             __m64 msa = load8888 (&quot);
 839             msa = expand_alpha (msa);
 840             ms = pix_multiply (ms, msa);
 841         }
 842
 843         md = pix_add (md, ms);
 844         store8888 (dest, md);
 845
 846         ++src;
 847         ++dest;
 848         if (mask)
 849             mask++;
 850     }
 851     _mm_empty ();
 852 }
 853
 854 static void
 855 mmx_combine_src_ca (pixman_implementation_t *imp,
 856                     pixman_op_t              op,
 857                     uint32_t *               dest,
 858                     const uint32_t *         src,
 859                     const uint32_t *         mask,
 860                     int                      width)
 861 {
 862     const uint32_t *end = src + width;
 863
 864     while (src < end)
 865     {
 866         __m64 a = load8888 (mask);
 867         __m64 s = load8888 (src);
 868
 869         s = pix_multiply (s, a);
 870         store8888 (dest, s);
 871
 872         ++src;
 873         ++mask;
 874         ++dest;
 875     }
 876     _mm_empty ();
 877 }
 878
 879 static void
 880 mmx_combine_over_ca (pixman_implementation_t *imp,
 881                      pixman_op_t              op,
 882                      uint32_t *               dest,
 883                      const uint32_t *         src,
 884                      const uint32_t *         mask,
 885                      int                      width)
 886 {
 887     const uint32_t *end = src + width;
 888
 889     while (src < end)
 890     {
 891         __m64 a = load8888 (mask);
 892         __m64 s = load8888 (src);
 893         __m64 d = load8888 (dest);
 894         __m64 sa = expand_alpha (s);
 895
 896         store8888 (dest, in_over (s, sa, a, d));
 897
 898         ++src;
 899         ++dest;
 900         ++mask;
 901     }
 902     _mm_empty ();
 903 }
 904
 905 static void
 906 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
 907                              pixman_op_t              op,
 908                              uint32_t *               dest,
 909                              const uint32_t *         src,
 910                              const uint32_t *         mask,
 911                              int                      width)
 912 {
 913     const uint32_t *end = src + width;
 914
 915     while (src < end)
 916     {
 917         __m64 a = load8888 (mask);
 918         __m64 s = load8888 (src);
 919         __m64 d = load8888 (dest);
 920         __m64 da = expand_alpha (d);
 921
 922         store8888 (dest, over (d, da, in (s, a)));
 923
 924         ++src;
 925         ++dest;
 926         ++mask;
 927     }
 928     _mm_empty ();
 929 }
 930
 931 static void
 932 mmx_combine_in_ca (pixman_implementation_t *imp,
 933                    pixman_op_t              op,
 934                    uint32_t *               dest,
 935                    const uint32_t *         src,
 936                    const uint32_t *         mask,
 937                    int                      width)
 938 {
 939     const uint32_t *end = src + width;
 940
 941     while (src < end)
 942     {
 943         __m64 a = load8888 (mask);
 944         __m64 s = load8888 (src);
 945         __m64 d = load8888 (dest);
 946         __m64 da = expand_alpha (d);
 947
 948         s = pix_multiply (s, a);
 949         s = pix_multiply (s, da);
 950         store8888 (dest, s);
 951
 952         ++src;
 953         ++dest;
 954         ++mask;
 955     }
 956     _mm_empty ();
 957 }
 958
 959 static void
 960 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
 961                            pixman_op_t              op,
 962                            uint32_t *               dest,
 963                            const uint32_t *         src,
 964                            const uint32_t *         mask,
 965                            int                      width)
 966 {
 967     const uint32_t *end = src + width;
 968
 969     while (src < end)
 970     {
 971         __m64 a = load8888 (mask);
 972         __m64 s = load8888 (src);
 973         __m64 d = load8888 (dest);
 974         __m64 sa = expand_alpha (s);
 975
 976         a = pix_multiply (a, sa);
 977         d = pix_multiply (d, a);
 978         store8888 (dest, d);
 979
 980         ++src;
 981         ++dest;
 982         ++mask;
 983     }
 984     _mm_empty ();
 985 }
 986
 987 static void
 988 mmx_combine_out_ca (pixman_implementation_t *imp,
 989                     pixman_op_t              op,
 990                     uint32_t *               dest,
 991                     const uint32_t *         src,
 992                     const uint32_t *         mask,
 993                     int                      width)
 994 {
 995     const uint32_t *end = src + width;
 996
 997     while (src < end)
 998     {
 999         __m64 a = load8888 (mask);
1000         __m64 s = load8888 (src);
1001         __m64 d = load8888 (dest);
1002         __m64 da = expand_alpha (d);
1003
1004         da = negate (da);
1005         s = pix_multiply (s, a);
1006         s = pix_multiply (s, da);
1007         store8888 (dest, s);
1008
1009         ++src;
1010         ++dest;
1011         ++mask;
1012     }
1013     _mm_empty ();
1014 }
1015
1016 static void
1017 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1018                             pixman_op_t              op,
1019                             uint32_t *               dest,
1020                             const uint32_t *         src,
1021                             const uint32_t *         mask,
1022                             int                      width)
1023 {
1024     const uint32_t *end = src + width;
1025
1026     while (src < end)
1027     {
1028         __m64 a = load8888 (mask);
1029         __m64 s = load8888 (src);
1030         __m64 d = load8888 (dest);
1031         __m64 sa = expand_alpha (s);
1032
1033         a = pix_multiply (a, sa);
1034         a = negate (a);
1035         d = pix_multiply (d, a);
1036         store8888 (dest, d);
1037
1038         ++src;
1039         ++dest;
1040         ++mask;
1041     }
1042     _mm_empty ();
1043 }
1044
1045 static void
1046 mmx_combine_atop_ca (pixman_implementation_t *imp,
1047                      pixman_op_t              op,
1048                      uint32_t *               dest,
1049                      const uint32_t *         src,
1050                      const uint32_t *         mask,
1051                      int                      width)
1052 {
1053     const uint32_t *end = src + width;
1054
1055     while (src < end)
1056     {
1057         __m64 a = load8888 (mask);
1058         __m64 s = load8888 (src);
1059         __m64 d = load8888 (dest);
1060         __m64 da = expand_alpha (d);
1061         __m64 sa = expand_alpha (s);
1062
1063         s = pix_multiply (s, a);
1064         a = pix_multiply (a, sa);
1065         a = negate (a);
1066         d = pix_add_mul (d, a, s, da);
1067         store8888 (dest, d);
1068
1069         ++src;
1070         ++dest;
1071         ++mask;
1072     }
1073     _mm_empty ();
1074 }
1075
1076 static void
1077 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1078                              pixman_op_t              op,
1079                              uint32_t *               dest,
1080                              const uint32_t *         src,
1081                              const uint32_t *         mask,
1082                              int                      width)
1083 {
1084     const uint32_t *end = src + width;
1085
1086     while (src < end)
1087     {
1088         __m64 a = load8888 (mask);
1089         __m64 s = load8888 (src);
1090         __m64 d = load8888 (dest);
1091         __m64 da = expand_alpha (d);
1092         __m64 sa = expand_alpha (s);
1093
1094         s = pix_multiply (s, a);
1095         a = pix_multiply (a, sa);
1096         da = negate (da);
1097         d = pix_add_mul (d, a, s, da);
1098         store8888 (dest, d);
1099
1100         ++src;
1101         ++dest;
1102         ++mask;
1103     }
1104     _mm_empty ();
1105 }
1106
1107 static void
1108 mmx_combine_xor_ca (pixman_implementation_t *imp,
1109                     pixman_op_t              op,
1110                     uint32_t *               dest,
1111                     const uint32_t *         src,
1112                     const uint32_t *         mask,
1113                     int                      width)
1114 {
1115     const uint32_t *end = src + width;
1116
1117     while (src < end)
1118     {
1119         __m64 a = load8888 (mask);
1120         __m64 s = load8888 (src);
1121         __m64 d = load8888 (dest);
1122         __m64 da = expand_alpha (d);
1123         __m64 sa = expand_alpha (s);
1124
1125         s = pix_multiply (s, a);
1126         a = pix_multiply (a, sa);
1127         da = negate (da);
1128         a = negate (a);
1129         d = pix_add_mul (d, a, s, da);
1130         store8888 (dest, d);
1131
1132         ++src;
1133         ++dest;
1134         ++mask;
1135     }
1136     _mm_empty ();
1137 }
1138
1139 static void
1140 mmx_combine_add_ca (pixman_implementation_t *imp,
1141                     pixman_op_t              op,
1142                     uint32_t *               dest,
1143                     const uint32_t *         src,
1144                     const uint32_t *         mask,
1145                     int                      width)
1146 {
1147     const uint32_t *end = src + width;
1148
1149     while (src < end)
1150     {
1151         __m64 a = load8888 (mask);
1152         __m64 s = load8888 (src);
1153         __m64 d = load8888 (dest);
1154
1155         s = pix_multiply (s, a);
1156         d = pix_add (s, d);
1157         store8888 (dest, d);
1158
1159         ++src;
1160         ++dest;
1161         ++mask;
1162     }
1163     _mm_empty ();
1164 }
1165
1166 /* ------------- MMX code paths called from fbpict.c -------------------- */
1167
1168 static void
1169 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1170                            pixman_composite_info_t *info)
1171 {
1172     PIXMAN_COMPOSITE_ARGS (info);
1173     uint32_t src;
1174     uint32_t    *dst_line, *dst;
1175     int32_t w;
1176     int dst_stride;
1177     __m64 vsrc, vsrca;
1178
1179     CHECKPOINT ();
1180
1181     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1182
1183     if (src == 0)
1184         return;
1185
1186     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1187
1188     vsrc = load8888 (&src);
1189     vsrca = expand_alpha (vsrc);
1190
1191     while (height--)
1192     {
1193         dst = dst_line;
1194         dst_line += dst_stride;
1195         w = width;
1196
1197         CHECKPOINT ();
1198
1199         while (w && (unsigned long)dst & 7)
1200         {
1201             store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1202
1203             w--;
1204             dst++;
1205         }
1206
1207         while (w >= 2)
1208         {
1209             __m64 vdest;
1210             __m64 dest0, dest1;
1211
1212             vdest = *(__m64 *)dst;
1213
1214             dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1215             dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1216
1217             *(__m64 *)dst = pack8888 (dest0, dest1);
1218
1219             dst += 2;
1220             w -= 2;
1221         }
1222
1223         CHECKPOINT ();
1224
1225         if (w)
1226         {
1227             store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1228         }
1229     }
1230
1231     _mm_empty ();
1232 }
1233
1234 static void
1235 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1236                            pixman_composite_info_t *info)
1237 {
1238     PIXMAN_COMPOSITE_ARGS (info);
1239     uint32_t src;
1240     uint16_t    *dst_line, *dst;
1241     int32_t w;
1242     int dst_stride;
1243     __m64 vsrc, vsrca;
1244
1245     CHECKPOINT ();
1246
1247     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1248
1249     if (src == 0)
1250         return;
1251
1252     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1253
1254     vsrc = load8888 (&src);
1255     vsrca = expand_alpha (vsrc);
1256
1257     while (height--)
1258     {
1259         dst = dst_line;
1260         dst_line += dst_stride;
1261         w = width;
1262
1263         CHECKPOINT ();
1264
1265         while (w && (unsigned long)dst & 7)
1266         {
1267             uint64_t d = *dst;
1268             __m64 vdest = expand565 (to_m64 (d), 0);
1269
1270             vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1271             *dst = to_uint64 (vdest);
1272
1273             w--;
1274             dst++;
1275         }
1276
1277         while (w >= 4)
1278         {
1279             __m64 vdest;
1280
1281             vdest = *(__m64 *)dst;
1282
1283             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
1284             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
1285             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
1286             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);
1287
1288             *(__m64 *)dst = vdest;
1289
1290             dst += 4;
1291             w -= 4;
1292         }
1293
1294         CHECKPOINT ();
1295
1296         while (w)
1297         {
1298             uint64_t d = *dst;
1299             __m64 vdest = expand565 (to_m64 (d), 0);
1300
1301             vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1302             *dst = to_uint64 (vdest);
1303
1304             w--;
1305             dst++;
1306         }
1307     }
1308
1309     _mm_empty ();
1310 }
1311
1312 static void
1313 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1314                                    pixman_composite_info_t *info)
1315 {
1316     PIXMAN_COMPOSITE_ARGS (info);
1317     uint32_t src;
1318     uint32_t    *dst_line;
1319     uint32_t    *mask_line;
1320     int dst_stride, mask_stride;
1321     __m64 vsrc, vsrca;
1322
1323     CHECKPOINT ();
1324
1325     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1326
1327     if (src == 0)
1328         return;
1329
1330     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1331     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1332
1333     vsrc = load8888 (&src);
1334     vsrca = expand_alpha (vsrc);
1335
1336     while (height--)
1337     {
1338         int twidth = width;
1339         uint32_t *p = (uint32_t *)mask_line;
1340         uint32_t *q = (uint32_t *)dst_line;
1341
1342         while (twidth && (unsigned long)q & 7)
1343         {
1344             uint32_t m = *(uint32_t *)p;
1345
1346             if (m)
1347             {
1348                 __m64 vdest = load8888 (q);
1349                 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1350                 store8888 (q, vdest);
1351             }
1352
1353             twidth--;
1354             p++;
1355             q++;
1356         }
1357
1358         while (twidth >= 2)
1359         {
1360             uint32_t m0, m1;
1361             m0 = *p;
1362             m1 = *(p + 1);
1363
1364             if (m0 | m1)
1365             {
1366                 __m64 dest0, dest1;
1367                 __m64 vdest = *(__m64 *)q;
1368
1369                 dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1370                                  expand8888 (vdest, 0));
1371                 dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1372                                  expand8888 (vdest, 1));
1373
1374                 *(__m64 *)q = pack8888 (dest0, dest1);
1375             }
1376
1377             p += 2;
1378             q += 2;
1379             twidth -= 2;
1380         }
1381
1382         if (twidth)
1383         {
1384             uint32_t m = *(uint32_t *)p;
1385
1386             if (m)
1387             {
1388                 __m64 vdest = load8888 (q);
1389                 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1390                 store8888 (q, vdest);
1391             }
1392
1393             twidth--;
1394             p++;
1395             q++;
1396         }
1397
1398         dst_line += dst_stride;
1399         mask_line += mask_stride;
1400     }
1401
1402     _mm_empty ();
1403 }
1404
1405 static void
1406 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1407                                 pixman_composite_info_t *info)
1408 {
1409     PIXMAN_COMPOSITE_ARGS (info);
1410     uint32_t    *dst_line, *dst;
1411     uint32_t    *src_line, *src;
1412     uint32_t mask;
1413     __m64 vmask;
1414     int dst_stride, src_stride;
1415     int32_t w;
1416
1417     CHECKPOINT ();
1418
1419     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1420     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1421
1422     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1423     mask &= 0xff000000;
1424     mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1425     vmask = load8888 (&mask);
1426
1427     while (height--)
1428     {
1429         dst = dst_line;
1430         dst_line += dst_stride;
1431         src = src_line;
1432         src_line += src_stride;
1433         w = width;
1434
1435         while (w && (unsigned long)dst & 7)
1436         {
1437             __m64 s = load8888 (src);
1438             __m64 d = load8888 (dst);
1439
1440             store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1441
1442             w--;
1443             dst++;
1444             src++;
1445         }
1446
1447         while (w >= 2)
1448         {
1449             __m64 vs = ldq_u((uint64_t *)src);
1450             __m64 vd = *(__m64 *)dst;
1451             __m64 vsrc0 = expand8888 (vs, 0);
1452             __m64 vsrc1 = expand8888 (vs, 1);
1453
1454             *(__m64 *)dst = pack8888 (
1455                 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1456                 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1457
1458             w -= 2;
1459             dst += 2;
1460             src += 2;
1461         }
1462
1463         if (w)
1464         {
1465             __m64 s = load8888 (src);
1466             __m64 d = load8888 (dst);
1467
1468             store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1469         }
1470     }
1471
1472     _mm_empty ();
1473 }
1474
1475 static void
1476 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1477                                 pixman_composite_info_t *info)
1478 {
1479     PIXMAN_COMPOSITE_ARGS (info);
1480     uint32_t *dst_line, *dst;
1481     uint32_t *src_line, *src;
1482     uint32_t mask;
1483     __m64 vmask;
1484     int dst_stride, src_stride;
1485     int32_t w;
1486     __m64 srca;
1487
1488     CHECKPOINT ();
1489
1490     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1491     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1492     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1493
1494     mask &= 0xff000000;
1495     mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1496     vmask = load8888 (&mask);
1497     srca = MC (4x00ff);
1498
1499     while (height--)
1500     {
1501         dst = dst_line;
1502         dst_line += dst_stride;
1503         src = src_line;
1504         src_line += src_stride;
1505         w = width;
1506
1507         while (w && (unsigned long)dst & 7)
1508         {
1509             uint32_t ssrc = *src | 0xff000000;
1510             __m64 s = load8888 (&ssrc);
1511             __m64 d = load8888 (dst);
1512
1513             store8888 (dst, in_over (s, srca, vmask, d));
1514
1515             w--;
1516             dst++;
1517             src++;
1518         }
1519
1520         while (w >= 16)
1521         {
1522             __m64 vd0 = *(__m64 *)(dst + 0);
1523             __m64 vd1 = *(__m64 *)(dst + 2);
1524             __m64 vd2 = *(__m64 *)(dst + 4);
1525             __m64 vd3 = *(__m64 *)(dst + 6);
1526             __m64 vd4 = *(__m64 *)(dst + 8);
1527             __m64 vd5 = *(__m64 *)(dst + 10);
1528             __m64 vd6 = *(__m64 *)(dst + 12);
1529             __m64 vd7 = *(__m64 *)(dst + 14);
1530
1531             __m64 vs0 = ldq_u((uint64_t *)(src + 0));
1532             __m64 vs1 = ldq_u((uint64_t *)(src + 2));
1533             __m64 vs2 = ldq_u((uint64_t *)(src + 4));
1534             __m64 vs3 = ldq_u((uint64_t *)(src + 6));
1535             __m64 vs4 = ldq_u((uint64_t *)(src + 8));
1536             __m64 vs5 = ldq_u((uint64_t *)(src + 10));
1537             __m64 vs6 = ldq_u((uint64_t *)(src + 12));
1538             __m64 vs7 = ldq_u((uint64_t *)(src + 14));
1539
1540             vd0 = pack8888 (
1541                 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1542                 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1543
1544             vd1 = pack8888 (
1545                 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1546                 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1547
1548             vd2 = pack8888 (
1549                 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1550                 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1551
1552             vd3 = pack8888 (
1553                 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1554                 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1555
1556             vd4 = pack8888 (
1557                 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1558                 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1559
1560             vd5 = pack8888 (
1561                 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1562                 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1563
1564             vd6 = pack8888 (
1565                 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1566                 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1567
1568             vd7 = pack8888 (
1569                 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1570                 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1571
1572             *(__m64 *)(dst + 0) = vd0;
1573             *(__m64 *)(dst + 2) = vd1;
1574             *(__m64 *)(dst + 4) = vd2;
1575             *(__m64 *)(dst + 6) = vd3;
1576             *(__m64 *)(dst + 8) = vd4;
1577             *(__m64 *)(dst + 10) = vd5;
1578             *(__m64 *)(dst + 12) = vd6;
1579             *(__m64 *)(dst + 14) = vd7;
1580
1581             w -= 16;
1582             dst += 16;
1583             src += 16;
1584         }
1585
1586         while (w)
1587         {
1588             uint32_t ssrc = *src | 0xff000000;
1589             __m64 s = load8888 (&ssrc);
1590             __m64 d = load8888 (dst);
1591
1592             store8888 (dst, in_over (s, srca, vmask, d));
1593
1594             w--;
1595             dst++;
1596             src++;
1597         }
1598     }
1599
1600     _mm_empty ();
1601 }
1602
1603 static void
1604 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1605                               pixman_composite_info_t *info)
1606 {
1607     PIXMAN_COMPOSITE_ARGS (info);
1608     uint32_t *dst_line, *dst;
1609     uint32_t *src_line, *src;
1610     uint32_t s;
1611     int dst_stride, src_stride;
1612     uint8_t a;
1613     int32_t w;
1614
1615     CHECKPOINT ();
1616
1617     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1618     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1619
1620     while (height--)
1621     {
1622         dst = dst_line;
1623         dst_line += dst_stride;
1624         src = src_line;
1625         src_line += src_stride;
1626         w = width;
1627
1628         while (w--)
1629         {
1630             s = *src++;
1631             a = s >> 24;
1632
1633             if (a == 0xff)
1634             {
1635                 *dst = s;
1636             }
1637             else if (s)
1638             {
1639                 __m64 ms, sa;
1640                 ms = load8888 (&s);
1641                 sa = expand_alpha (ms);
1642                 store8888 (dst, over (ms, sa, load8888 (dst)));
1643             }
1644
1645             dst++;
1646         }
1647     }
1648     _mm_empty ();
1649 }
1650
1651 static void
1652 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1653                               pixman_composite_info_t *info)
1654 {
1655     PIXMAN_COMPOSITE_ARGS (info);
1656     uint16_t    *dst_line, *dst;
1657     uint32_t    *src_line, *src;
1658     int dst_stride, src_stride;
1659     int32_t w;
1660
1661     CHECKPOINT ();
1662
1663     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1664     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1665
1666 #if 0
1667     /* FIXME */
1668     assert (src_image->drawable == mask_image->drawable);
1669 #endif
1670
1671     while (height--)
1672     {
1673         dst = dst_line;
1674         dst_line += dst_stride;
1675         src = src_line;
1676         src_line += src_stride;
1677         w = width;
1678
1679         CHECKPOINT ();
1680
1681         while (w && (unsigned long)dst & 7)
1682         {
1683             __m64 vsrc = load8888 (src);
1684             uint64_t d = *dst;
1685             __m64 vdest = expand565 (to_m64 (d), 0);
1686
1687             vdest = pack_565 (
1688                 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1689
1690             *dst = to_uint64 (vdest);
1691
1692             w--;
1693             dst++;
1694             src++;
1695         }
1696
1697         CHECKPOINT ();
1698
1699         while (w >= 4)
1700         {
1701             __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1702             __m64 vdest;
1703
1704             vsrc0 = load8888 ((src + 0));
1705             vsrc1 = load8888 ((src + 1));
1706             vsrc2 = load8888 ((src + 2));
1707             vsrc3 = load8888 ((src + 3));
1708
1709             vdest = *(__m64 *)dst;
1710
1711             vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
1712             vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
1713             vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
1714             vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);
1715
1716             *(__m64 *)dst = vdest;
1717
1718             w -= 4;
1719             dst += 4;
1720             src += 4;
1721         }
1722
1723         CHECKPOINT ();
1724
1725         while (w)
1726         {
1727             __m64 vsrc = load8888 (src);
1728             uint64_t d = *dst;
1729             __m64 vdest = expand565 (to_m64 (d), 0);
1730
1731             vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1732
1733             *dst = to_uint64 (vdest);
1734
1735             w--;
1736             dst++;
1737             src++;
1738         }
1739     }
1740
1741     _mm_empty ();
1742 }
1743
1744 static void
1745 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1746                              pixman_composite_info_t *info)
1747 {
1748     PIXMAN_COMPOSITE_ARGS (info);
1749     uint32_t src, srca;
1750     uint32_t *dst_line, *dst;
1751     uint8_t *mask_line, *mask;
1752     int dst_stride, mask_stride;
1753     int32_t w;
1754     __m64 vsrc, vsrca;
1755     uint64_t srcsrc;
1756
1757     CHECKPOINT ();
1758
1759     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1760
1761     srca = src >> 24;
1762     if (src == 0)
1763         return;
1764
1765     srcsrc = (uint64_t)src << 32 | src;
1766
1767     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1768     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1769
1770     vsrc = load8888 (&src);
1771     vsrca = expand_alpha (vsrc);
1772
1773     while (height--)
1774     {
1775         dst = dst_line;
1776         dst_line += dst_stride;
1777         mask = mask_line;
1778         mask_line += mask_stride;
1779         w = width;
1780
1781         CHECKPOINT ();
1782
1783         while (w && (unsigned long)dst & 7)
1784         {
1785             uint64_t m = *mask;
1786
1787             if (m)
1788             {
1789                 __m64 vdest = in_over (vsrc, vsrca,
1790                                        expand_alpha_rev (to_m64 (m)),
1791                                        load8888 (dst));
1792
1793                 store8888 (dst, vdest);
1794             }
1795
1796             w--;
1797             mask++;
1798             dst++;
1799         }
1800
1801         CHECKPOINT ();
1802
1803         while (w >= 2)
1804         {
1805             uint64_t m0, m1;
1806
1807             m0 = *mask;
1808             m1 = *(mask + 1);
1809
1810             if (srca == 0xff && (m0 & m1) == 0xff)
1811             {
1812                 *(uint64_t *)dst = srcsrc;
1813             }
1814             else if (m0 | m1)
1815             {
1816                 __m64 vdest;
1817                 __m64 dest0, dest1;
1818
1819                 vdest = *(__m64 *)dst;
1820
1821                 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
1822                                  expand8888 (vdest, 0));
1823                 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
1824                                  expand8888 (vdest, 1));
1825
1826                 *(__m64 *)dst = pack8888 (dest0, dest1);
1827             }
1828
1829             mask += 2;
1830             dst += 2;
1831             w -= 2;
1832         }
1833
1834         CHECKPOINT ();
1835
1836         if (w)
1837         {
1838             uint64_t m = *mask;
1839
1840             if (m)
1841             {
1842                 __m64 vdest = load8888 (dst);
1843
1844                 vdest = in_over (
1845                     vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
1846                 store8888 (dst, vdest);
1847             }
1848         }
1849     }
1850
1851     _mm_empty ();
1852 }
1853
1854 pixman_bool_t
1855 pixman_fill_mmx (uint32_t *bits,
1856                  int       stride,
1857                  int       bpp,
1858                  int       x,
1859                  int       y,
1860                  int       width,
1861                  int       height,
1862                  uint32_t xor)
1863 {
1864     uint64_t fill;
1865     __m64 vfill;
1866     uint32_t byte_width;
1867     uint8_t     *byte_line;
1868
1869 #if defined __GNUC__ && defined USE_X86_MMX
1870     __m64 v1, v2, v3, v4, v5, v6, v7;
1871 #endif
1872
1873     if (bpp != 16 && bpp != 32 && bpp != 8)
1874         return FALSE;
1875
1876     if (bpp == 8)
1877     {
1878         stride = stride * (int) sizeof (uint32_t) / 1;
1879         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
1880         byte_width = width;
1881         stride *= 1;
1882         xor = (xor & 0xff) * 0x01010101;
1883     }
1884     else if (bpp == 16)
1885     {
1886         stride = stride * (int) sizeof (uint32_t) / 2;
1887         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
1888         byte_width = 2 * width;
1889         stride *= 2;
1890         xor = (xor & 0xffff) * 0x00010001;
1891     }
1892     else
1893     {
1894         stride = stride * (int) sizeof (uint32_t) / 4;
1895         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
1896         byte_width = 4 * width;
1897         stride *= 4;
1898     }
1899
1900     fill = ((uint64_t)xor << 32) | xor;
1901     vfill = to_m64 (fill);
1902
1903 #if defined __GNUC__ && defined USE_X86_MMX
1904     __asm__ (
1905         "movq           %7,     %0\n"
1906         "movq           %7,     %1\n"
1907         "movq           %7,     %2\n"
1908         "movq           %7,     %3\n"
1909         "movq           %7,     %4\n"
1910         "movq           %7,     %5\n"
1911         "movq           %7,     %6\n"
1912         : "=&y" (v1), "=&y" (v2), "=&y" (v3),
1913           "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
1914         : "y" (vfill));
1915 #endif
1916
1917     while (height--)
1918     {
1919         int w;
1920         uint8_t *d = byte_line;
1921
1922         byte_line += stride;
1923         w = byte_width;
1924
1925         if (w >= 1 && ((unsigned long)d & 1))
1926         {
1927             *(uint8_t *)d = (xor & 0xff);
1928             w--;
1929             d++;
1930         }
1931
1932         if (w >= 2 && ((unsigned long)d & 3))
1933         {
1934             *(uint16_t *)d = xor;
1935             w -= 2;
1936             d += 2;
1937         }
1938
1939         while (w >= 4 && ((unsigned long)d & 7))
1940         {
1941             *(uint32_t *)d = xor;
1942
1943             w -= 4;
1944             d += 4;
1945         }
1946
1947         while (w >= 64)
1948         {
1949 #if defined __GNUC__ && defined USE_X86_MMX
1950             __asm__ (
1951                 "movq   %1,       (%0)\n"
1952                 "movq   %2,      8(%0)\n"
1953                 "movq   %3,     16(%0)\n"
1954                 "movq   %4,     24(%0)\n"
1955                 "movq   %5,     32(%0)\n"
1956                 "movq   %6,     40(%0)\n"
1957                 "movq   %7,     48(%0)\n"
1958                 "movq   %8,     56(%0)\n"
1959                 :
1960                 : "r" (d),
1961                   "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
1962                   "y" (v4), "y" (v5), "y" (v6), "y" (v7)
1963                 : "memory");
1964 #else
1965             *(__m64*) (d +  0) = vfill;
1966             *(__m64*) (d +  8) = vfill;
1967             *(__m64*) (d + 16) = vfill;
1968             *(__m64*) (d + 24) = vfill;
1969             *(__m64*) (d + 32) = vfill;
1970             *(__m64*) (d + 40) = vfill;
1971             *(__m64*) (d + 48) = vfill;
1972             *(__m64*) (d + 56) = vfill;
1973 #endif
1974             w -= 64;
1975             d += 64;
1976         }
1977
1978         while (w >= 4)
1979         {
1980             *(uint32_t *)d = xor;
1981
1982             w -= 4;
1983             d += 4;
1984         }
1985         if (w >= 2)
1986         {
1987             *(uint16_t *)d = xor;
1988             w -= 2;
1989             d += 2;
1990         }
1991         if (w >= 1)
1992         {
1993             *(uint8_t *)d = (xor & 0xff);
1994             w--;
1995             d++;
1996         }
1997
1998     }
1999
2000     _mm_empty ();
2001     return TRUE;
2002 }
2003
2004 static void
2005 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2006                             pixman_composite_info_t *info)
2007 {
2008     PIXMAN_COMPOSITE_ARGS (info);
2009     uint32_t src, srca;
2010     uint32_t    *dst_line, *dst;
2011     uint8_t     *mask_line, *mask;
2012     int dst_stride, mask_stride;
2013     int32_t w;
2014     __m64 vsrc;
2015     uint64_t srcsrc;
2016
2017     CHECKPOINT ();
2018
2019     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2020
2021     srca = src >> 24;
2022     if (src == 0)
2023     {
2024         pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride,
2025                          PIXMAN_FORMAT_BPP (dest_image->bits.format),
2026                          dest_x, dest_y, width, height, 0);
2027         return;
2028     }
2029
2030     srcsrc = (uint64_t)src << 32 | src;
2031
2032     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2033     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2034
2035     vsrc = load8888 (&src);
2036
2037     while (height--)
2038     {
2039         dst = dst_line;
2040         dst_line += dst_stride;
2041         mask = mask_line;
2042         mask_line += mask_stride;
2043         w = width;
2044
2045         CHECKPOINT ();
2046
2047         while (w && (unsigned long)dst & 7)
2048         {
2049             uint64_t m = *mask;
2050
2051             if (m)
2052             {
2053                 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2054
2055                 store8888 (dst, vdest);
2056             }
2057             else
2058             {
2059                 *dst = 0;
2060             }
2061
2062             w--;
2063             mask++;
2064             dst++;
2065         }
2066
2067         CHECKPOINT ();
2068
2069         while (w >= 2)
2070         {
2071             uint64_t m0, m1;
2072             m0 = *mask;
2073             m1 = *(mask + 1);
2074
2075             if (srca == 0xff && (m0 & m1) == 0xff)
2076             {
2077                 *(uint64_t *)dst = srcsrc;
2078             }
2079             else if (m0 | m1)
2080             {
2081                 __m64 dest0, dest1;
2082
2083                 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2084                 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2085
2086                 *(__m64 *)dst = pack8888 (dest0, dest1);
2087             }
2088             else
2089             {
2090                 *(uint64_t *)dst = 0;
2091             }
2092
2093             mask += 2;
2094             dst += 2;
2095             w -= 2;
2096         }
2097
2098         CHECKPOINT ();
2099
2100         if (w)
2101         {
2102             uint64_t m = *mask;
2103
2104             if (m)
2105             {
2106                 __m64 vdest = load8888 (dst);
2107
2108                 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2109                 store8888 (dst, vdest);
2110             }
2111             else
2112             {
2113                 *dst = 0;
2114             }
2115         }
2116     }
2117
2118     _mm_empty ();
2119 }
2120
2121 static void
2122 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2123                              pixman_composite_info_t *info)
2124 {
2125     PIXMAN_COMPOSITE_ARGS (info);
2126     uint32_t src, srca;
2127     uint16_t *dst_line, *dst;
2128     uint8_t *mask_line, *mask;
2129     int dst_stride, mask_stride;
2130     int32_t w;
2131     __m64 vsrc, vsrca, tmp;
2132     uint64_t srcsrcsrcsrc, src16;
2133
2134     CHECKPOINT ();
2135
2136     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2137
2138     srca = src >> 24;
2139     if (src == 0)
2140         return;
2141
2142     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2143     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2144
2145     vsrc = load8888 (&src);
2146     vsrca = expand_alpha (vsrc);
2147
2148     tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2149     src16 = to_uint64 (tmp);
2150
2151     srcsrcsrcsrc =
2152         (uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
2153         (uint64_t)src16 << 16 | (uint64_t)src16;
2154
2155     while (height--)
2156     {
2157         dst = dst_line;
2158         dst_line += dst_stride;
2159         mask = mask_line;
2160         mask_line += mask_stride;
2161         w = width;
2162
2163         CHECKPOINT ();
2164
2165         while (w && (unsigned long)dst & 7)
2166         {
2167             uint64_t m = *mask;
2168
2169             if (m)
2170             {
2171                 uint64_t d = *dst;
2172                 __m64 vd = to_m64 (d);
2173                 __m64 vdest = in_over (
2174                     vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2175
2176                 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2177                 *dst = to_uint64 (vd);
2178             }
2179
2180             w--;
2181             mask++;
2182             dst++;
2183         }
2184
2185         CHECKPOINT ();
2186
2187         while (w >= 4)
2188         {
2189             uint64_t m0, m1, m2, m3;
2190             m0 = *mask;
2191             m1 = *(mask + 1);
2192             m2 = *(mask + 2);
2193             m3 = *(mask + 3);
2194
2195             if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2196             {
2197                 *(uint64_t *)dst = srcsrcsrcsrc;
2198             }
2199             else if (m0 | m1 | m2 | m3)
2200             {
2201                 __m64 vdest;
2202                 __m64 vm0, vm1, vm2, vm3;
2203
2204                 vdest = *(__m64 *)dst;
2205
2206                 vm0 = to_m64 (m0);
2207                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
2208                                            expand565 (vdest, 0)), vdest, 0);
2209                 vm1 = to_m64 (m1);
2210                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
2211                                            expand565 (vdest, 1)), vdest, 1);
2212                 vm2 = to_m64 (m2);
2213                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
2214                                            expand565 (vdest, 2)), vdest, 2);
2215                 vm3 = to_m64 (m3);
2216                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
2217                                            expand565 (vdest, 3)), vdest, 3);
2218
2219                 *(__m64 *)dst = vdest;
2220             }
2221
2222             w -= 4;
2223             mask += 4;
2224             dst += 4;
2225         }
2226
2227         CHECKPOINT ();
2228
2229         while (w)
2230         {
2231             uint64_t m = *mask;
2232
2233             if (m)
2234             {
2235                 uint64_t d = *dst;
2236                 __m64 vd = to_m64 (d);
2237                 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2238                                        expand565 (vd, 0));
2239                 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2240                 *dst = to_uint64 (vd);
2241             }
2242
2243             w--;
2244             mask++;
2245             dst++;
2246         }
2247     }
2248
2249     _mm_empty ();
2250 }
2251
2252 static void
2253 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2254                                 pixman_composite_info_t *info)
2255 {
2256     PIXMAN_COMPOSITE_ARGS (info);
2257     uint16_t    *dst_line, *dst;
2258     uint32_t    *src_line, *src;
2259     int dst_stride, src_stride;
2260     int32_t w;
2261
2262     CHECKPOINT ();
2263
2264     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2265     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2266
2267 #if 0
2268     /* FIXME */
2269     assert (src_image->drawable == mask_image->drawable);
2270 #endif
2271
2272     while (height--)
2273     {
2274         dst = dst_line;
2275         dst_line += dst_stride;
2276         src = src_line;
2277         src_line += src_stride;
2278         w = width;
2279
2280         CHECKPOINT ();
2281
2282         while (w && (unsigned long)dst & 7)
2283         {
2284             __m64 vsrc = load8888 (src);
2285             uint64_t d = *dst;
2286             __m64 vdest = expand565 (to_m64 (d), 0);
2287
2288             vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2289
2290             *dst = to_uint64 (vdest);
2291
2292             w--;
2293             dst++;
2294             src++;
2295         }
2296
2297         CHECKPOINT ();
2298
2299         while (w >= 4)
2300         {
2301             uint32_t s0, s1, s2, s3;
2302             unsigned char a0, a1, a2, a3;
2303
2304             s0 = *src;
2305             s1 = *(src + 1);
2306             s2 = *(src + 2);
2307             s3 = *(src + 3);
2308
2309             a0 = (s0 >> 24);
2310             a1 = (s1 >> 24);
2311             a2 = (s2 >> 24);
2312             a3 = (s3 >> 24);
2313
2314             if ((a0 & a1 & a2 & a3) == 0xFF)
2315             {
2316                 __m64 vdest;
2317                 vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0);
2318                 vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1);
2319                 vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2);
2320                 vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3);
2321
2322                 *(__m64 *)dst = vdest;
2323             }
2324             else if (s0 | s1 | s2 | s3)
2325             {
2326                 __m64 vdest = *(__m64 *)dst;
2327
2328                 vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0);
2329                 vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1);
2330                 vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2);
2331                 vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3);
2332
2333                 *(__m64 *)dst = vdest;
2334             }
2335
2336             w -= 4;
2337             dst += 4;
2338             src += 4;
2339         }
2340
2341         CHECKPOINT ();
2342
2343         while (w)
2344         {
2345             __m64 vsrc = load8888 (src);
2346             uint64_t d = *dst;
2347             __m64 vdest = expand565 (to_m64 (d), 0);
2348
2349             vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2350
2351             *dst = to_uint64 (vdest);
2352
2353             w--;
2354             dst++;
2355             src++;
2356         }
2357     }
2358
2359     _mm_empty ();
2360 }
2361
2362 static void
2363 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2364                                 pixman_composite_info_t *info)
2365 {
2366     PIXMAN_COMPOSITE_ARGS (info);
2367     uint32_t    *dst_line, *dst;
2368     uint32_t    *src_line, *src;
2369     int dst_stride, src_stride;
2370     int32_t w;
2371
2372     CHECKPOINT ();
2373
2374     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2375     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2376
2377 #if 0
2378     /* FIXME */
2379     assert (src_image->drawable == mask_image->drawable);
2380 #endif
2381
2382     while (height--)
2383     {
2384         dst = dst_line;
2385         dst_line += dst_stride;
2386         src = src_line;
2387         src_line += src_stride;
2388         w = width;
2389
2390         while (w && (unsigned long)dst & 7)
2391         {
2392             __m64 s = load8888 (src);
2393             __m64 d = load8888 (dst);
2394
2395             store8888 (dst, over_rev_non_pre (s, d));
2396
2397             w--;
2398             dst++;
2399             src++;
2400         }
2401
2402         while (w >= 2)
2403         {
2404             uint32_t s0, s1;
2405             unsigned char a0, a1;
2406             __m64 d0, d1;
2407
2408             s0 = *src;
2409             s1 = *(src + 1);
2410
2411             a0 = (s0 >> 24);
2412             a1 = (s1 >> 24);
2413
2414             if ((a0 & a1) == 0xFF)
2415             {
2416                 d0 = invert_colors (load8888 (&s0));
2417                 d1 = invert_colors (load8888 (&s1));
2418
2419                 *(__m64 *)dst = pack8888 (d0, d1);
2420             }
2421             else if (s0 | s1)
2422             {
2423                 __m64 vdest = *(__m64 *)dst;
2424
2425                 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2426                 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2427
2428                 *(__m64 *)dst = pack8888 (d0, d1);
2429             }
2430
2431             w -= 2;
2432             dst += 2;
2433             src += 2;
2434         }
2435
2436         if (w)
2437         {
2438             __m64 s = load8888 (src);
2439             __m64 d = load8888 (dst);
2440
2441             store8888 (dst, over_rev_non_pre (s, d));
2442         }
2443     }
2444
2445     _mm_empty ();
2446 }
2447
2448 static void
2449 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2450                                    pixman_composite_info_t *info)
2451 {
2452     PIXMAN_COMPOSITE_ARGS (info);
2453     uint32_t src;
2454     uint16_t    *dst_line;
2455     uint32_t    *mask_line;
2456     int dst_stride, mask_stride;
2457     __m64 vsrc, vsrca;
2458
2459     CHECKPOINT ();
2460
2461     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2462
2463     if (src == 0)
2464         return;
2465
2466     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2467     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2468
2469     vsrc = load8888 (&src);
2470     vsrca = expand_alpha (vsrc);
2471
2472     while (height--)
2473     {
2474         int twidth = width;
2475         uint32_t *p = (uint32_t *)mask_line;
2476         uint16_t *q = (uint16_t *)dst_line;
2477
2478         while (twidth && ((unsigned long)q & 7))
2479         {
2480             uint32_t m = *(uint32_t *)p;
2481
2482             if (m)
2483             {
2484                 uint64_t d = *q;
2485                 __m64 vdest = expand565 (to_m64 (d), 0);
2486                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2487                 *q = to_uint64 (vdest);
2488             }
2489
2490             twidth--;
2491             p++;
2492             q++;
2493         }
2494
2495         while (twidth >= 4)
2496         {
2497             uint32_t m0, m1, m2, m3;
2498
2499             m0 = *p;
2500             m1 = *(p + 1);
2501             m2 = *(p + 2);
2502             m3 = *(p + 3);
2503
2504             if ((m0 | m1 | m2 | m3))
2505             {
2506                 __m64 vdest = *(__m64 *)q;
2507
2508                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0);
2509                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1);
2510                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2);
2511                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3);
2512
2513                 *(__m64 *)q = vdest;
2514             }
2515             twidth -= 4;
2516             p += 4;
2517             q += 4;
2518         }
2519
2520         while (twidth)
2521         {
2522             uint32_t m;
2523
2524             m = *(uint32_t *)p;
2525             if (m)
2526             {
2527                 uint64_t d = *q;
2528                 __m64 vdest = expand565 (to_m64 (d), 0);
2529                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2530                 *q = to_uint64 (vdest);
2531             }
2532
2533             twidth--;
2534             p++;
2535             q++;
2536         }
2537
2538         mask_line += mask_stride;
2539         dst_line += dst_stride;
2540     }
2541
2542     _mm_empty ();
2543 }
2544
2545 static void
2546 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2547                         pixman_composite_info_t *info)
2548 {
2549     PIXMAN_COMPOSITE_ARGS (info);
2550     uint8_t *dst_line, *dst;
2551     uint8_t *mask_line, *mask;
2552     int dst_stride, mask_stride;
2553     int32_t w;
2554     uint32_t src;
2555     uint8_t sa;
2556     __m64 vsrc, vsrca;
2557
2558     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2559     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2560
2561     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2562
2563     sa = src >> 24;
2564
2565     vsrc = load8888 (&src);
2566     vsrca = expand_alpha (vsrc);
2567
2568     while (height--)
2569     {
2570         dst = dst_line;
2571         dst_line += dst_stride;
2572         mask = mask_line;
2573         mask_line += mask_stride;
2574         w = width;
2575
2576         while (w && (unsigned long)dst & 7)
2577         {
2578             uint16_t tmp;
2579             uint8_t a;
2580             uint32_t m, d;
2581
2582             a = *mask++;
2583             d = *dst;
2584
2585             m = MUL_UN8 (sa, a, tmp);
2586             d = MUL_UN8 (m, d, tmp);
2587
2588             *dst++ = d;
2589             w--;
2590         }
2591
2592         while (w >= 4)
2593         {
2594             __m64 vmask;
2595             __m64 vdest;
2596
2597             vmask = load8888u ((uint32_t *)mask);
2598             vdest = load8888 ((uint32_t *)dst);
2599
2600             store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2601
2602             dst += 4;
2603             mask += 4;
2604             w -= 4;
2605         }
2606
2607         while (w--)
2608         {
2609             uint16_t tmp;
2610             uint8_t a;
2611             uint32_t m, d;
2612
2613             a = *mask++;
2614             d = *dst;
2615
2616             m = MUL_UN8 (sa, a, tmp);
2617             d = MUL_UN8 (m, d, tmp);
2618
2619             *dst++ = d;
2620         }
2621     }
2622
2623     _mm_empty ();
2624 }
2625
2626 static void
2627 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2628                       pixman_composite_info_t *info)
2629 {
2630     PIXMAN_COMPOSITE_ARGS (info);
2631     uint8_t     *dst_line, *dst;
2632     uint8_t     *src_line, *src;
2633     int src_stride, dst_stride;
2634     int32_t w;
2635
2636     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2637     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2638
2639     while (height--)
2640     {
2641         dst = dst_line;
2642         dst_line += dst_stride;
2643         src = src_line;
2644         src_line += src_stride;
2645         w = width;
2646
2647         while (w && (unsigned long)dst & 3)
2648         {
2649             uint8_t s, d;
2650             uint16_t tmp;
2651
2652             s = *src;
2653             d = *dst;
2654
2655             *dst = MUL_UN8 (s, d, tmp);
2656
2657             src++;
2658             dst++;
2659             w--;
2660         }
2661
2662         while (w >= 4)
2663         {
2664             uint32_t *s = (uint32_t *)src;
2665             uint32_t *d = (uint32_t *)dst;
2666
2667             store8888 (d, in (load8888u (s), load8888 (d)));
2668
2669             w -= 4;
2670             dst += 4;
2671             src += 4;
2672         }
2673
2674         while (w--)
2675         {
2676             uint8_t s, d;
2677             uint16_t tmp;
2678
2679             s = *src;
2680             d = *dst;
2681
2682             *dst = MUL_UN8 (s, d, tmp);
2683
2684             src++;
2685             dst++;
2686         }
2687     }
2688
2689     _mm_empty ();
2690 }
2691
2692 static void
2693 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2694                          pixman_composite_info_t *info)
2695 {
2696     PIXMAN_COMPOSITE_ARGS (info);
2697     uint8_t     *dst_line, *dst;
2698     uint8_t     *mask_line, *mask;
2699     int dst_stride, mask_stride;
2700     int32_t w;
2701     uint32_t src;
2702     uint8_t sa;
2703     __m64 vsrc, vsrca;
2704
2705     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2706     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2707
2708     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2709
2710     sa = src >> 24;
2711
2712     if (src == 0)
2713         return;
2714
2715     vsrc = load8888 (&src);
2716     vsrca = expand_alpha (vsrc);
2717
2718     while (height--)
2719     {
2720         dst = dst_line;
2721         dst_line += dst_stride;
2722         mask = mask_line;
2723         mask_line += mask_stride;
2724         w = width;
2725
2726         while (w && (unsigned long)dst & 3)
2727         {
2728             uint16_t tmp;
2729             uint16_t a;
2730             uint32_t m, d;
2731             uint32_t r;
2732
2733             a = *mask++;
2734             d = *dst;
2735
2736             m = MUL_UN8 (sa, a, tmp);
2737             r = ADD_UN8 (m, d, tmp);
2738
2739             *dst++ = r;
2740             w--;
2741         }
2742
2743         while (w >= 4)
2744         {
2745             __m64 vmask;
2746             __m64 vdest;
2747
2748             vmask = load8888u ((uint32_t *)mask);
2749             vdest = load8888 ((uint32_t *)dst);
2750
2751             store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
2752
2753             dst += 4;
2754             mask += 4;
2755             w -= 4;
2756         }
2757
2758         while (w--)
2759         {
2760             uint16_t tmp;
2761             uint16_t a;
2762             uint32_t m, d;
2763             uint32_t r;
2764
2765             a = *mask++;
2766             d = *dst;
2767
2768             m = MUL_UN8 (sa, a, tmp);
2769             r = ADD_UN8 (m, d, tmp);
2770
2771             *dst++ = r;
2772         }
2773     }
2774
2775     _mm_empty ();
2776 }
2777
2778 static void
2779 mmx_composite_add_8_8 (pixman_implementation_t *imp,
2780                        pixman_composite_info_t *info)
2781 {
2782     PIXMAN_COMPOSITE_ARGS (info);
2783     uint8_t *dst_line, *dst;
2784     uint8_t *src_line, *src;
2785     int dst_stride, src_stride;
2786     int32_t w;
2787     uint8_t s, d;
2788     uint16_t t;
2789
2790     CHECKPOINT ();
2791
2792     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2793     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2794
2795     while (height--)
2796     {
2797         dst = dst_line;
2798         dst_line += dst_stride;
2799         src = src_line;
2800         src_line += src_stride;
2801         w = width;
2802
2803         while (w && (unsigned long)dst & 7)
2804         {
2805             s = *src;
2806             d = *dst;
2807             t = d + s;
2808             s = t | (0 - (t >> 8));
2809             *dst = s;
2810
2811             dst++;
2812             src++;
2813             w--;
2814         }
2815
2816         while (w >= 8)
2817         {
2818             *(__m64*)dst = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
2819             dst += 8;
2820             src += 8;
2821             w -= 8;
2822         }
2823
2824         while (w)
2825         {
2826             s = *src;
2827             d = *dst;
2828             t = d + s;
2829             s = t | (0 - (t >> 8));
2830             *dst = s;
2831
2832             dst++;
2833             src++;
2834             w--;
2835         }
2836     }
2837
2838     _mm_empty ();
2839 }
2840
2841 static void
2842 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
2843                              pixman_composite_info_t *info)
2844 {
2845     PIXMAN_COMPOSITE_ARGS (info);
2846     __m64 dst64;
2847     uint32_t    *dst_line, *dst;
2848     uint32_t    *src_line, *src;
2849     int dst_stride, src_stride;
2850     int32_t w;
2851
2852     CHECKPOINT ();
2853
2854     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2855     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2856
2857     while (height--)
2858     {
2859         dst = dst_line;
2860         dst_line += dst_stride;
2861         src = src_line;
2862         src_line += src_stride;
2863         w = width;
2864
2865         while (w && (unsigned long)dst & 7)
2866         {
2867             store (dst, _mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
2868                                       _mm_cvtsi32_si64 (*dst)));
2869             dst++;
2870             src++;
2871             w--;
2872         }
2873
2874         while (w >= 2)
2875         {
2876             dst64 = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
2877             *(uint64_t*)dst = to_uint64 (dst64);
2878             dst += 2;
2879             src += 2;
2880             w -= 2;
2881         }
2882
2883         if (w)
2884         {
2885             store (dst, _mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
2886                                       _mm_cvtsi32_si64 (*dst)));
2887
2888         }
2889     }
2890
2891     _mm_empty ();
2892 }
2893
2894 static pixman_bool_t
2895 pixman_blt_mmx (uint32_t *src_bits,
2896                 uint32_t *dst_bits,
2897                 int       src_stride,
2898                 int       dst_stride,
2899                 int       src_bpp,
2900                 int       dst_bpp,
2901                 int       src_x,
2902                 int       src_y,
2903                 int       dest_x,
2904                 int       dest_y,
2905                 int       width,
2906                 int       height)
2907 {
2908     uint8_t *   src_bytes;
2909     uint8_t *   dst_bytes;
2910     int byte_width;
2911
2912     if (src_bpp != dst_bpp)
2913         return FALSE;
2914
2915     if (src_bpp == 16)
2916     {
2917         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
2918         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
2919         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
2920         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
2921         byte_width = 2 * width;
2922         src_stride *= 2;
2923         dst_stride *= 2;
2924     }
2925     else if (src_bpp == 32)
2926     {
2927         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
2928         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
2929         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
2930         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
2931         byte_width = 4 * width;
2932         src_stride *= 4;
2933         dst_stride *= 4;
2934     }
2935     else
2936     {
2937         return FALSE;
2938     }
2939
2940     while (height--)
2941     {
2942         int w;
2943         uint8_t *s = src_bytes;
2944         uint8_t *d = dst_bytes;
2945         src_bytes += src_stride;
2946         dst_bytes += dst_stride;
2947         w = byte_width;
2948
2949         if (w >= 1 && ((unsigned long)d & 1))
2950         {
2951             *(uint8_t *)d = *(uint8_t *)s;
2952             w -= 1;
2953             s += 1;
2954             d += 1;
2955         }
2956
2957         if (w >= 2 && ((unsigned long)d & 3))
2958         {
2959             *(uint16_t *)d = *(uint16_t *)s;
2960             w -= 2;
2961             s += 2;
2962             d += 2;
2963         }
2964
2965         while (w >= 4 && ((unsigned long)d & 7))
2966         {
2967             *(uint32_t *)d = ldl_u((uint32_t *)s);
2968
2969             w -= 4;
2970             s += 4;
2971             d += 4;
2972         }
2973
2974         while (w >= 64)
2975         {
2976 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
2977             __asm__ (
2978                 "movq     (%1),   %%mm0\n"
2979                 "movq    8(%1),   %%mm1\n"
2980                 "movq   16(%1),   %%mm2\n"
2981                 "movq   24(%1),   %%mm3\n"
2982                 "movq   32(%1),   %%mm4\n"
2983                 "movq   40(%1),   %%mm5\n"
2984                 "movq   48(%1),   %%mm6\n"
2985                 "movq   56(%1),   %%mm7\n"
2986
2987                 "movq   %%mm0,    (%0)\n"
2988                 "movq   %%mm1,   8(%0)\n"
2989                 "movq   %%mm2,  16(%0)\n"
2990                 "movq   %%mm3,  24(%0)\n"
2991                 "movq   %%mm4,  32(%0)\n"
2992                 "movq   %%mm5,  40(%0)\n"
2993                 "movq   %%mm6,  48(%0)\n"
2994                 "movq   %%mm7,  56(%0)\n"
2995                 :
2996                 : "r" (d), "r" (s)
2997                 : "memory",
2998                   "%mm0", "%mm1", "%mm2", "%mm3",
2999                   "%mm4", "%mm5", "%mm6", "%mm7");
3000 #else
3001             __m64 v0 = ldq_u((uint64_t *)(s + 0));
3002             __m64 v1 = ldq_u((uint64_t *)(s + 8));
3003             __m64 v2 = ldq_u((uint64_t *)(s + 16));
3004             __m64 v3 = ldq_u((uint64_t *)(s + 24));
3005             __m64 v4 = ldq_u((uint64_t *)(s + 32));
3006             __m64 v5 = ldq_u((uint64_t *)(s + 40));
3007             __m64 v6 = ldq_u((uint64_t *)(s + 48));
3008             __m64 v7 = ldq_u((uint64_t *)(s + 56));
3009             *(__m64 *)(d + 0)  = v0;
3010             *(__m64 *)(d + 8)  = v1;
3011             *(__m64 *)(d + 16) = v2;
3012             *(__m64 *)(d + 24) = v3;
3013             *(__m64 *)(d + 32) = v4;
3014             *(__m64 *)(d + 40) = v5;
3015             *(__m64 *)(d + 48) = v6;
3016             *(__m64 *)(d + 56) = v7;
3017 #endif
3018
3019             w -= 64;
3020             s += 64;
3021             d += 64;
3022         }
3023         while (w >= 4)
3024         {
3025             *(uint32_t *)d = ldl_u((uint32_t *)s);
3026
3027             w -= 4;
3028             s += 4;
3029             d += 4;
3030         }
3031         if (w >= 2)
3032         {
3033             *(uint16_t *)d = *(uint16_t *)s;
3034             w -= 2;
3035             s += 2;
3036             d += 2;
3037         }
3038     }
3039
3040     _mm_empty ();
3041
3042     return TRUE;
3043 }
3044
3045 static void
3046 mmx_composite_copy_area (pixman_implementation_t *imp,
3047                          pixman_composite_info_t *info)
3048 {
3049     PIXMAN_COMPOSITE_ARGS (info);
3050
3051     pixman_blt_mmx (src_image->bits.bits,
3052                     dest_image->bits.bits,
3053                     src_image->bits.rowstride,
3054                     dest_image->bits.rowstride,
3055                     PIXMAN_FORMAT_BPP (src_image->bits.format),
3056                     PIXMAN_FORMAT_BPP (dest_image->bits.format),
3057                     src_x, src_y, dest_x, dest_y, width, height);
3058 }
3059
3060 static void
3061 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3062                                 pixman_composite_info_t *info)
3063 {
3064     PIXMAN_COMPOSITE_ARGS (info);
3065     uint32_t  *src, *src_line;
3066     uint32_t  *dst, *dst_line;
3067     uint8_t  *mask, *mask_line;
3068     int src_stride, mask_stride, dst_stride;
3069     int32_t w;
3070
3071     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3072     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3073     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3074
3075     while (height--)
3076     {
3077         src = src_line;
3078         src_line += src_stride;
3079         dst = dst_line;
3080         dst_line += dst_stride;
3081         mask = mask_line;
3082         mask_line += mask_stride;
3083
3084         w = width;
3085
3086         while (w--)
3087         {
3088             uint64_t m = *mask;
3089
3090             if (m)
3091             {
3092                 uint32_t ssrc = *src | 0xff000000;
3093                 __m64 s = load8888 (&ssrc);
3094
3095                 if (m == 0xff)
3096                 {
3097                     store8888 (dst, s);
3098                 }
3099                 else
3100                 {
3101                     __m64 sa = expand_alpha (s);
3102                     __m64 vm = expand_alpha_rev (to_m64 (m));
3103                     __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3104
3105                     store8888 (dst, vdest);
3106                 }
3107             }
3108
3109             mask++;
3110             dst++;
3111             src++;
3112         }
3113     }
3114
3115     _mm_empty ();
3116 }
3117
3118 static const pixman_fast_path_t mmx_fast_paths[] =
3119 {
3120     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
3121     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
3122     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
3123     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
3124     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
3125     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
3126     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3127     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3128     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
3129     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3130     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3131     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
3132     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3133     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3134     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
3135     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3136     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3137     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
3138     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
3139     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
3140     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
3141     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
3142     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
3143     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
3144     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
3145     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
3146     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
3147     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
3148     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
3149     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
3150     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
3151     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
3152     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
3153     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
3154     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3155     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3156
3157     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
3158     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
3159     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
3160     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
3161     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
3162     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
3163
3164     PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
3165     PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
3166     PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
3167     PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
3168
3169     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
3170     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
3171     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
3172     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
3173     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
3174     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
3175     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3176     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3177     PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3178     PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3179     PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
3180     PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
3181
3182     PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
3183     PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
3184
3185     { PIXMAN_OP_NONE },
3186 };
3187
3188 /*
3189  * Blt entry point of the MMX implementation.  pixman_blt_mmx () is tried
3190  * first; when it reports failure, the same request is forwarded to
3191  * imp->delegate and the delegate's result is returned instead.
3192  */
3193 static pixman_bool_t
3194 mmx_blt (pixman_implementation_t *imp,
3195          uint32_t *src_bits, uint32_t *dst_bits,
3196          int src_stride, int dst_stride,
3197          int src_bpp, int dst_bpp,
3198          int src_x, int src_y,
3199          int dest_x, int dest_y,
3200          int width, int height)
3201 {
3202     /* First choice: the MMX routine.  A true result is reported as TRUE. */
3203     if (pixman_blt_mmx (
3204             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3205             src_x, src_y, dest_x, dest_y, width, height))
3206     {
3207         return TRUE;
3208     }
3209
3210     /* It returned false, so hand the identical request to the delegate. */
3211     return _pixman_implementation_blt (
3212         imp->delegate,
3213         src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3214         src_x, src_y, dest_x, dest_y, width, height);
3215 }
3216
3217 /*
3218  * Fill entry point of the MMX implementation.  pixman_fill_mmx () is
3219  * tried first; when it reports failure, the call is forwarded to
3220  * imp->delegate and the delegate's result is returned instead.
3221  */
3222 static pixman_bool_t
3223 mmx_fill (pixman_implementation_t *imp,
3224           uint32_t *bits, int stride, int bpp,
3225           int x, int y, int width, int height,
3226           uint32_t xor)
3227 {
3228     if (pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
3229         return TRUE;
3230
3231     /* pixman_fill_mmx () returned false; pass the same arguments on. */
3232     return _pixman_implementation_fill (
3233         imp->delegate,
3234         bits, stride, bpp, x, y, width, height, xor);
3235 }
3236
3237 /* Create the MMX implementation: _pixman_implementation_create () is given
3238  * `fallback' and mmx_fast_paths, then the MMX hooks below are installed. */
3239 pixman_implementation_t *
3240 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
3241 {
3242     pixman_implementation_t *impl = _pixman_implementation_create (fallback, mmx_fast_paths);
3243     /* Entries of the combine_32 table. */
3244     impl->combine_32[PIXMAN_OP_OVER]         = mmx_combine_over_u;
3245     impl->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
3246     impl->combine_32[PIXMAN_OP_IN]           = mmx_combine_in_u;
3247     impl->combine_32[PIXMAN_OP_IN_REVERSE]   = mmx_combine_in_reverse_u;
3248     impl->combine_32[PIXMAN_OP_OUT]          = mmx_combine_out_u;
3249     impl->combine_32[PIXMAN_OP_OUT_REVERSE]  = mmx_combine_out_reverse_u;
3250     impl->combine_32[PIXMAN_OP_ATOP]         = mmx_combine_atop_u;
3251     impl->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
3252     impl->combine_32[PIXMAN_OP_XOR]          = mmx_combine_xor_u;
3253     impl->combine_32[PIXMAN_OP_ADD]          = mmx_combine_add_u;
3254     impl->combine_32[PIXMAN_OP_SATURATE]     = mmx_combine_saturate_u;
3255     /* Entries of the combine_32_ca table. */
3256     impl->combine_32_ca[PIXMAN_OP_SRC]          = mmx_combine_src_ca;
3257     impl->combine_32_ca[PIXMAN_OP_OVER]         = mmx_combine_over_ca;
3258     impl->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
3259     impl->combine_32_ca[PIXMAN_OP_IN]           = mmx_combine_in_ca;
3260     impl->combine_32_ca[PIXMAN_OP_IN_REVERSE]   = mmx_combine_in_reverse_ca;
3261     impl->combine_32_ca[PIXMAN_OP_OUT]          = mmx_combine_out_ca;
3262     impl->combine_32_ca[PIXMAN_OP_OUT_REVERSE]  = mmx_combine_out_reverse_ca;
3263     impl->combine_32_ca[PIXMAN_OP_ATOP]         = mmx_combine_atop_ca;
3264     impl->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
3265     impl->combine_32_ca[PIXMAN_OP_XOR]          = mmx_combine_xor_ca;
3266     impl->combine_32_ca[PIXMAN_OP_ADD]          = mmx_combine_add_ca;
3267     impl->blt  = mmx_blt;
3268     impl->fill = mmx_fill;
3269     return impl;
3270 }
3271
3272 #endif /* USE_X86_MMX || USE_ARM_IWMMXT */