pixman/pixman-mmx.c

   1 /*
   2  * Copyright © 2004, 2005 Red Hat, Inc.
   3  * Copyright © 2004 Nicholas Miell
   4  * Copyright © 2005 Trolltech AS
   5  *
   6  * Permission to use, copy, modify, distribute, and sell this software and its
   7  * documentation for any purpose is hereby granted without fee, provided that
   8  * the above copyright notice appear in all copies and that both that
   9  * copyright notice and this permission notice appear in supporting
  10  * documentation, and that the name of Red Hat not be used in advertising or
  11  * publicity pertaining to distribution of the software without specific,
  12  * written prior permission.  Red Hat makes no representations about the
  13  * suitability of this software for any purpose.  It is provided "as is"
  14  * without express or implied warranty.
  15  *
  16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
  17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
  19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
  21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  23  * SOFTWARE.
  24  *
  25  * Author:  Søren Sandmann (sandmann@redhat.com)
  26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
  27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
  28  *
  29  * Based on work by Owen Taylor
  30  */
  31
  32 #ifdef HAVE_CONFIG_H
  33 #include <config.h>
  34 #endif
  35
  36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT
  37
  38 #include <mmintrin.h>
  39 #include "pixman-private.h"
  40 #include "pixman-combine32.h"
  41
  42 #define no_vERBOSE
  43
  44 #ifdef VERBOSE
  45 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
  46 #else
  47 #define CHECKPOINT()
  48 #endif
  49
  50 #ifdef USE_ARM_IWMMXT
  51 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
  52 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  53 _mm_empty (void)
  54 {
  55
  56 }
  57 #endif
  58
  59 #ifdef USE_X86_MMX
  60 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
  61  * instructions to be generated that we don't want. Just duplicate the
  62  * functions we want to use.  */
  63 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  64 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
  65 {
  66     asm("pmulhuw %1, %0\n\t"
  67         : "+y" (__A)
  68         : "y" (__B)
  69     );
  70     return __A;
  71 }
  72
  73 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  74 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
  75 {
  76     __m64 ret;
  77
  78     asm("pshufw %2, %1, %0\n\t"
  79         : "=y" (ret)
  80         : "y" (__A), "K" (__N)
  81     );
  82
  83     return ret;
  84 }
  85 #endif
  86
  87 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
  88  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
  89
  90 /* Notes about writing mmx code
  91  *
  92  * give memory operands as the second operand. If you give it as the
  93  * first, gcc will first load it into a register, then use that
  94  * register
  95  *
  96  *   ie. use
  97  *
  98  *         _mm_mullo_pi16 (x, mmx_constant);
  99  *
 100  *   not
 101  *
 102  *         _mm_mullo_pi16 (mmx_constant, x);
 103  *
 104  * Also try to minimize dependencies. i.e. when you need a value, try
 105  * to calculate it from a value that was calculated as early as
 106  * possible.
 107  */
 108
 109 /* --------------- MMX primitives ------------------------------------- */
 110
 111 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 112  * the name of the member used to access the data.
 113  * If __m64 requires using mm_cvt* intrinsics functions to convert between
 114  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 115  * If __m64 and uint64_t values can just be cast to each other directly,
 116  * then define USE_M64_CASTS.
 117  */
 118 #ifdef _MSC_VER
 119 # define M64_MEMBER m64_u64
 120 #elif defined(__ICC)
 121 # define USE_CVT_INTRINSICS
 122 #elif defined(__GNUC__)
 123 # define USE_M64_CASTS
 124 #elif defined(__SUNPRO_C)
 125 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
 126 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 127  * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 128  * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
 129  */
 130 #  define USE_CVT_INTRINSICS
 131 # else
 132 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 133  * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 134  */
 135 #  define M64_MEMBER l_
 136 # endif
 137 #endif
 138
 139 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS)
 140 typedef uint64_t mmxdatafield;
 141 #else
 142 typedef __m64 mmxdatafield;
 143 #endif
 144
 145 typedef struct
 146 {
 147     mmxdatafield mmx_4x00ff;
 148     mmxdatafield mmx_4x0080;
 149     mmxdatafield mmx_565_rgb;
 150     mmxdatafield mmx_565_unpack_multiplier;
 151     mmxdatafield mmx_565_r;
 152     mmxdatafield mmx_565_g;
 153     mmxdatafield mmx_565_b;
 154     mmxdatafield mmx_mask_0;
 155     mmxdatafield mmx_mask_1;
 156     mmxdatafield mmx_mask_2;
 157     mmxdatafield mmx_mask_3;
 158     mmxdatafield mmx_full_alpha;
 159     mmxdatafield mmx_4x0101;
 160 } mmx_data_t;
 161
 162 #if defined(_MSC_VER)
 163 # define MMXDATA_INIT(field, val) { val ## UI64 }
 164 #elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
 165 # define MMXDATA_INIT(field, val) field =   { val ## ULL }
 166 #else                           /* mmxdatafield is an integral type */
 167 # define MMXDATA_INIT(field, val) field =   val ## ULL
 168 #endif
 169
 170 static const mmx_data_t c =
 171 {
 172     MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
 173     MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
 174     MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
 175     MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
 176     MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
 177     MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
 178     MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
 179     MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
 180     MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
 181     MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
 182     MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
 183     MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
 184     MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
 185 };
 186
 187 #ifdef USE_CVT_INTRINSICS
 188 #    define MC(x) to_m64 (c.mmx_ ## x)
 189 #elif defined(USE_M64_CASTS)
 190 #    define MC(x) ((__m64)c.mmx_ ## x)
 191 #else
 192 #    define MC(x) c.mmx_ ## x
 193 #endif
 194
 195 static force_inline __m64
 196 to_m64 (uint64_t x)
 197 {
 198 #ifdef USE_CVT_INTRINSICS
 199     return _mm_cvtsi64_m64 (x);
 200 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
 201     __m64 res;
 202
 203     res.M64_MEMBER = x;
 204     return res;
 205 #else /* USE_M64_CASTS */
 206     return (__m64)x;
 207 #endif
 208 }
 209
 210 static force_inline uint64_t
 211 to_uint64 (__m64 x)
 212 {
 213 #ifdef USE_CVT_INTRINSICS
 214     return _mm_cvtm64_si64 (x);
 215 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
 216     uint64_t res = x.M64_MEMBER;
 217     return res;
 218 #else /* USE_M64_CASTS */
 219     return (uint64_t)x;
 220 #endif
 221 }
 222
 223 static force_inline __m64
 224 shift (__m64 v,
 225        int   s)
 226 {
 227     if (s > 0)
 228         return _mm_slli_si64 (v, s);
 229     else if (s < 0)
 230         return _mm_srli_si64 (v, -s);
 231     else
 232         return v;
 233 }
 234
 235 static force_inline __m64
 236 negate (__m64 mask)
 237 {
 238     return _mm_xor_si64 (mask, MC (4x00ff));
 239 }
 240
 241 static force_inline __m64
 242 pix_multiply (__m64 a, __m64 b)
 243 {
 244     __m64 res;
 245
 246     res = _mm_mullo_pi16 (a, b);
 247     res = _mm_adds_pu16 (res, MC (4x0080));
 248     res = _mm_mulhi_pu16 (res, MC (4x0101));
 249
 250     return res;
 251 }
 252
 253 static force_inline __m64
 254 pix_add (__m64 a, __m64 b)
 255 {
 256     return _mm_adds_pu8 (a, b);
 257 }
 258
 259 static force_inline __m64
 260 expand_alpha (__m64 pixel)
 261 {
 262     return _mm_shuffle_pi16(pixel, _MM_SHUFFLE (3, 3, 3, 3));
 263 }
 264
 265 static force_inline __m64
 266 expand_alpha_rev (__m64 pixel)
 267 {
 268     return _mm_shuffle_pi16(pixel, _MM_SHUFFLE (0, 0, 0, 0));
 269 }
 270
 271 static force_inline __m64
 272 invert_colors (__m64 pixel)
 273 {
 274     return _mm_shuffle_pi16(pixel, _MM_SHUFFLE (3, 0, 1, 2));
 275 }
 276
 277 static force_inline __m64
 278 over (__m64 src,
 279       __m64 srca,
 280       __m64 dest)
 281 {
 282     return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
 283 }
 284
 285 static force_inline __m64
 286 over_rev_non_pre (__m64 src, __m64 dest)
 287 {
 288     __m64 srca = expand_alpha (src);
 289     __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
 290
 291     return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
 292 }
 293
 294 static force_inline __m64
 295 in (__m64 src, __m64 mask)
 296 {
 297     return pix_multiply (src, mask);
 298 }
 299
 300 static force_inline __m64
 301 in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)
 302 {
 303     src = _mm_or_si64 (src, MC (full_alpha));
 304
 305     return over (in (src, mask), mask, dest);
 306 }
 307
 308 #ifndef _MSC_VER
 309 static force_inline __m64
 310 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
 311 {
 312     return over (in (src, mask), pix_multiply (srca, mask), dest);
 313 }
 314
 315 #else
 316
 317 #define in_over(src, srca, mask, dest)                                  \
 318     over (in (src, mask), pix_multiply (srca, mask), dest)
 319
 320 #endif
 321
 322 /* Elemental unaligned loads */
 323
 324 static __inline__ __m64 ldq_u(uint64_t *p)
 325 {
 326 #ifdef USE_X86_MMX
 327     /* x86's alignment restrictions are very relaxed. */
 328     return *(__m64 *)p;
 329 #elif defined USE_ARM_IWMMXT
 330     int align = (uintptr_t)p & 7;
 331     __m64 *aligned_p;
 332     if (align == 0)
 333         return *p;
 334     aligned_p = (__m64 *)((uintptr_t)p & ~7);
 335     return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
 336 #else
 337     struct __una_u64 { uint64_t x __attribute__((packed)); };
 338     const struct __una_u64 *ptr = (const struct __una_u64 *) p;
 339     return (__m64) ptr->x;
 340 #endif
 341 }
 342
 343 static __inline__ uint32_t ldl_u(uint32_t *p)
 344 {
 345 #ifdef USE_X86_MMX
 346     /* x86's alignment restrictions are very relaxed. */
 347     return *p;
 348 #else
 349     struct __una_u32 { uint32_t x __attribute__((packed)); };
 350     const struct __una_u32 *ptr = (const struct __una_u32 *) p;
 351     return ptr->x;
 352 #endif
 353 }
 354
 355 static force_inline __m64
 356 load8888 (uint32_t v)
 357 {
 358     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ());
 359 }
 360
 361 static force_inline __m64
 362 pack8888 (__m64 lo, __m64 hi)
 363 {
 364     return _mm_packs_pu16 (lo, hi);
 365 }
 366
 367 static force_inline uint32_t
 368 store8888 (__m64 v)
 369 {
 370     return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
 371 }
 372
 373 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 374  *
 375  *    00RR00GG00BB
 376  *
 377  * --- Expanding 565 in the low word ---
 378  *
  379  * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
 380  * m = m & (01f0003f001f);
 381  * m = m * (008404100840);
 382  * m = m >> 8;
 383  *
 384  * Note the trick here - the top word is shifted by another nibble to
 385  * avoid it bumping into the middle word
 386  */
 387 static force_inline __m64
 388 expand565 (__m64 pixel, int pos)
 389 {
 390     __m64 p = pixel;
 391     __m64 t1, t2;
 392
 393     /* move pixel to low 16 bit and zero the rest */
 394     p = shift (shift (p, (3 - pos) * 16), -48);
 395
 396     t1 = shift (p, 36 - 11);
 397     t2 = shift (p, 16 - 5);
 398
 399     p = _mm_or_si64 (t1, p);
 400     p = _mm_or_si64 (t2, p);
 401     p = _mm_and_si64 (p, MC (565_rgb));
 402
 403     pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
 404     return _mm_srli_pi16 (pixel, 8);
 405 }
 406
 407 static force_inline __m64
 408 expand8888 (__m64 in, int pos)
 409 {
 410     if (pos == 0)
 411         return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
 412     else
 413         return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
 414 }
 415
 416 static force_inline __m64
 417 expandx888 (__m64 in, int pos)
 418 {
 419     return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
 420 }
 421
 422 static force_inline __m64
 423 pack_565 (__m64 pixel, __m64 target, int pos)
 424 {
 425     __m64 p = pixel;
 426     __m64 t = target;
 427     __m64 r, g, b;
 428
 429     r = _mm_and_si64 (p, MC (565_r));
 430     g = _mm_and_si64 (p, MC (565_g));
 431     b = _mm_and_si64 (p, MC (565_b));
 432
 433     r = shift (r, -(32 - 8) + pos * 16);
 434     g = shift (g, -(16 - 3) + pos * 16);
 435     b = shift (b, -(0  + 3) + pos * 16);
 436
 437     if (pos == 0)
 438         t = _mm_and_si64 (t, MC (mask_0));
 439     else if (pos == 1)
 440         t = _mm_and_si64 (t, MC (mask_1));
 441     else if (pos == 2)
 442         t = _mm_and_si64 (t, MC (mask_2));
 443     else if (pos == 3)
 444         t = _mm_and_si64 (t, MC (mask_3));
 445
 446     p = _mm_or_si64 (r, t);
 447     p = _mm_or_si64 (g, p);
 448
 449     return _mm_or_si64 (b, p);
 450 }
 451
 452 #ifndef _MSC_VER
 453
 454 static force_inline __m64
 455 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
 456 {
 457     x = pix_multiply (x, a);
 458     y = pix_multiply (y, b);
 459
 460     return pix_add (x, y);
 461 }
 462
 463 #else
 464
 465 #define pix_add_mul(x, a, y, b)  \
 466     ( x = pix_multiply (x, a),   \
 467       y = pix_multiply (y, a),   \
 468       pix_add (x, y) )
 469
 470 #endif
 471
  472 /* --------------- MMX code paths for fbcompose.c --------------------- */
 473
 474 static force_inline uint32_t
 475 combine (const uint32_t *src, const uint32_t *mask)
 476 {
 477     uint32_t ssrc = *src;
 478
 479     if (mask)
 480     {
 481         __m64 m = load8888 (*mask);
 482         __m64 s = load8888 (ssrc);
 483
 484         m = expand_alpha (m);
 485         s = pix_multiply (s, m);
 486
 487         ssrc = store8888 (s);
 488     }
 489
 490     return ssrc;
 491 }
 492
 493 static void
 494 mmx_combine_over_u (pixman_implementation_t *imp,
 495                     pixman_op_t              op,
 496                     uint32_t *               dest,
 497                     const uint32_t *         src,
 498                     const uint32_t *         mask,
 499                     int                      width)
 500 {
 501     const uint32_t *end = dest + width;
 502
 503     while (dest < end)
 504     {
 505         uint32_t ssrc = combine (src, mask);
 506         uint32_t a = ssrc >> 24;
 507
 508         if (a == 0xff)
 509         {
 510             *dest = ssrc;
 511         }
 512         else if (ssrc)
 513         {
 514             __m64 s, sa;
 515             s = load8888 (ssrc);
 516             sa = expand_alpha (s);
 517             *dest = store8888 (over (s, sa, load8888 (*dest)));
 518         }
 519
 520         ++dest;
 521         ++src;
 522         if (mask)
 523             ++mask;
 524     }
 525     _mm_empty ();
 526 }
 527
 528 static void
 529 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
 530                             pixman_op_t              op,
 531                             uint32_t *               dest,
 532                             const uint32_t *         src,
 533                             const uint32_t *         mask,
 534                             int                      width)
 535 {
 536     const uint32_t *end = dest + width;
 537
 538     while (dest < end)
 539     {
 540         __m64 d, da;
 541         uint32_t s = combine (src, mask);
 542
 543         d = load8888 (*dest);
 544         da = expand_alpha (d);
 545         *dest = store8888 (over (d, da, load8888 (s)));
 546
 547         ++dest;
 548         ++src;
 549         if (mask)
 550             mask++;
 551     }
 552     _mm_empty ();
 553 }
 554
 555 static void
 556 mmx_combine_in_u (pixman_implementation_t *imp,
 557                   pixman_op_t              op,
 558                   uint32_t *               dest,
 559                   const uint32_t *         src,
 560                   const uint32_t *         mask,
 561                   int                      width)
 562 {
 563     const uint32_t *end = dest + width;
 564
 565     while (dest < end)
 566     {
 567         __m64 x, a;
 568
 569         x = load8888 (combine (src, mask));
 570         a = load8888 (*dest);
 571         a = expand_alpha (a);
 572         x = pix_multiply (x, a);
 573
 574         *dest = store8888 (x);
 575
 576         ++dest;
 577         ++src;
 578         if (mask)
 579             mask++;
 580     }
 581     _mm_empty ();
 582 }
 583
 584 static void
 585 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
 586                           pixman_op_t              op,
 587                           uint32_t *               dest,
 588                           const uint32_t *         src,
 589                           const uint32_t *         mask,
 590                           int                      width)
 591 {
 592     const uint32_t *end = dest + width;
 593
 594     while (dest < end)
 595     {
 596         __m64 x, a;
 597
 598         x = load8888 (*dest);
 599         a = load8888 (combine (src, mask));
 600         a = expand_alpha (a);
 601         x = pix_multiply (x, a);
 602         *dest = store8888 (x);
 603
 604         ++dest;
 605         ++src;
 606         if (mask)
 607             mask++;
 608     }
 609     _mm_empty ();
 610 }
 611
 612 static void
 613 mmx_combine_out_u (pixman_implementation_t *imp,
 614                    pixman_op_t              op,
 615                    uint32_t *               dest,
 616                    const uint32_t *         src,
 617                    const uint32_t *         mask,
 618                    int                      width)
 619 {
 620     const uint32_t *end = dest + width;
 621
 622     while (dest < end)
 623     {
 624         __m64 x, a;
 625
 626         x = load8888 (combine (src, mask));
 627         a = load8888 (*dest);
 628         a = expand_alpha (a);
 629         a = negate (a);
 630         x = pix_multiply (x, a);
 631         *dest = store8888 (x);
 632
 633         ++dest;
 634         ++src;
 635         if (mask)
 636             mask++;
 637     }
 638     _mm_empty ();
 639 }
 640
 641 static void
 642 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
 643                            pixman_op_t              op,
 644                            uint32_t *               dest,
 645                            const uint32_t *         src,
 646                            const uint32_t *         mask,
 647                            int                      width)
 648 {
 649     const uint32_t *end = dest + width;
 650
 651     while (dest < end)
 652     {
 653         __m64 x, a;
 654
 655         x = load8888 (*dest);
 656         a = load8888 (combine (src, mask));
 657         a = expand_alpha (a);
 658         a = negate (a);
 659         x = pix_multiply (x, a);
 660
 661         *dest = store8888 (x);
 662
 663         ++dest;
 664         ++src;
 665         if (mask)
 666             mask++;
 667     }
 668     _mm_empty ();
 669 }
 670
 671 static void
 672 mmx_combine_atop_u (pixman_implementation_t *imp,
 673                     pixman_op_t              op,
 674                     uint32_t *               dest,
 675                     const uint32_t *         src,
 676                     const uint32_t *         mask,
 677                     int                      width)
 678 {
 679     const uint32_t *end = dest + width;
 680
 681     while (dest < end)
 682     {
 683         __m64 s, da, d, sia;
 684
 685         s = load8888 (combine (src, mask));
 686         d = load8888 (*dest);
 687         sia = expand_alpha (s);
 688         sia = negate (sia);
 689         da = expand_alpha (d);
 690         s = pix_add_mul (s, da, d, sia);
 691         *dest = store8888 (s);
 692
 693         ++dest;
 694         ++src;
 695         if (mask)
 696             mask++;
 697     }
 698     _mm_empty ();
 699 }
 700
 701 static void
 702 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
 703                             pixman_op_t              op,
 704                             uint32_t *               dest,
 705                             const uint32_t *         src,
 706                             const uint32_t *         mask,
 707                             int                      width)
 708 {
 709     const uint32_t *end;
 710
 711     end = dest + width;
 712
 713     while (dest < end)
 714     {
 715         __m64 s, dia, d, sa;
 716
 717         s = load8888 (combine (src, mask));
 718         d = load8888 (*dest);
 719         sa = expand_alpha (s);
 720         dia = expand_alpha (d);
 721         dia = negate (dia);
 722         s = pix_add_mul (s, dia, d, sa);
 723         *dest = store8888 (s);
 724
 725         ++dest;
 726         ++src;
 727         if (mask)
 728             mask++;
 729     }
 730     _mm_empty ();
 731 }
 732
 733 static void
 734 mmx_combine_xor_u (pixman_implementation_t *imp,
 735                    pixman_op_t              op,
 736                    uint32_t *               dest,
 737                    const uint32_t *         src,
 738                    const uint32_t *         mask,
 739                    int                      width)
 740 {
 741     const uint32_t *end = dest + width;
 742
 743     while (dest < end)
 744     {
 745         __m64 s, dia, d, sia;
 746
 747         s = load8888 (combine (src, mask));
 748         d = load8888 (*dest);
 749         sia = expand_alpha (s);
 750         dia = expand_alpha (d);
 751         sia = negate (sia);
 752         dia = negate (dia);
 753         s = pix_add_mul (s, dia, d, sia);
 754         *dest = store8888 (s);
 755
 756         ++dest;
 757         ++src;
 758         if (mask)
 759             mask++;
 760     }
 761     _mm_empty ();
 762 }
 763
 764 static void
 765 mmx_combine_add_u (pixman_implementation_t *imp,
 766                    pixman_op_t              op,
 767                    uint32_t *               dest,
 768                    const uint32_t *         src,
 769                    const uint32_t *         mask,
 770                    int                      width)
 771 {
 772     const uint32_t *end = dest + width;
 773
 774     while (dest < end)
 775     {
 776         __m64 s, d;
 777
 778         s = load8888 (combine (src, mask));
 779         d = load8888 (*dest);
 780         s = pix_add (s, d);
 781         *dest = store8888 (s);
 782
 783         ++dest;
 784         ++src;
 785         if (mask)
 786             mask++;
 787     }
 788     _mm_empty ();
 789 }
 790
 791 static void
 792 mmx_combine_saturate_u (pixman_implementation_t *imp,
 793                         pixman_op_t              op,
 794                         uint32_t *               dest,
 795                         const uint32_t *         src,
 796                         const uint32_t *         mask,
 797                         int                      width)
 798 {
 799     const uint32_t *end = dest + width;
 800
 801     while (dest < end)
 802     {
 803         uint32_t s = combine (src, mask);
 804         uint32_t d = *dest;
 805         __m64 ms = load8888 (s);
 806         __m64 md = load8888 (d);
 807         uint32_t sa = s >> 24;
 808         uint32_t da = ~d >> 24;
 809
 810         if (sa > da)
 811         {
 812             __m64 msa = load8888 (DIV_UN8 (da, sa) << 24);
 813             msa = expand_alpha (msa);
 814             ms = pix_multiply (ms, msa);
 815         }
 816
 817         md = pix_add (md, ms);
 818         *dest = store8888 (md);
 819
 820         ++src;
 821         ++dest;
 822         if (mask)
 823             mask++;
 824     }
 825     _mm_empty ();
 826 }
 827
 828 static void
 829 mmx_combine_src_ca (pixman_implementation_t *imp,
 830                     pixman_op_t              op,
 831                     uint32_t *               dest,
 832                     const uint32_t *         src,
 833                     const uint32_t *         mask,
 834                     int                      width)
 835 {
 836     const uint32_t *end = src + width;
 837
 838     while (src < end)
 839     {
 840         __m64 a = load8888 (*mask);
 841         __m64 s = load8888 (*src);
 842
 843         s = pix_multiply (s, a);
 844         *dest = store8888 (s);
 845
 846         ++src;
 847         ++mask;
 848         ++dest;
 849     }
 850     _mm_empty ();
 851 }
 852
 853 static void
 854 mmx_combine_over_ca (pixman_implementation_t *imp,
 855                      pixman_op_t              op,
 856                      uint32_t *               dest,
 857                      const uint32_t *         src,
 858                      const uint32_t *         mask,
 859                      int                      width)
 860 {
 861     const uint32_t *end = src + width;
 862
 863     while (src < end)
 864     {
 865         __m64 a = load8888 (*mask);
 866         __m64 s = load8888 (*src);
 867         __m64 d = load8888 (*dest);
 868         __m64 sa = expand_alpha (s);
 869
 870         *dest = store8888 (in_over (s, sa, a, d));
 871
 872         ++src;
 873         ++dest;
 874         ++mask;
 875     }
 876     _mm_empty ();
 877 }
 878
 879 static void
 880 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
 881                              pixman_op_t              op,
 882                              uint32_t *               dest,
 883                              const uint32_t *         src,
 884                              const uint32_t *         mask,
 885                              int                      width)
 886 {
 887     const uint32_t *end = src + width;
 888
 889     while (src < end)
 890     {
 891         __m64 a = load8888 (*mask);
 892         __m64 s = load8888 (*src);
 893         __m64 d = load8888 (*dest);
 894         __m64 da = expand_alpha (d);
 895
 896         *dest = store8888 (over (d, da, in (s, a)));
 897
 898         ++src;
 899         ++dest;
 900         ++mask;
 901     }
 902     _mm_empty ();
 903 }
 904
 905 static void
 906 mmx_combine_in_ca (pixman_implementation_t *imp,
 907                    pixman_op_t              op,
 908                    uint32_t *               dest,
 909                    const uint32_t *         src,
 910                    const uint32_t *         mask,
 911                    int                      width)
 912 {
 913     const uint32_t *end = src + width;
 914
 915     while (src < end)
 916     {
 917         __m64 a = load8888 (*mask);
 918         __m64 s = load8888 (*src);
 919         __m64 d = load8888 (*dest);
 920         __m64 da = expand_alpha (d);
 921
 922         s = pix_multiply (s, a);
 923         s = pix_multiply (s, da);
 924         *dest = store8888 (s);
 925
 926         ++src;
 927         ++dest;
 928         ++mask;
 929     }
 930     _mm_empty ();
 931 }
 932
 933 static void
 934 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
 935                            pixman_op_t              op,
 936                            uint32_t *               dest,
 937                            const uint32_t *         src,
 938                            const uint32_t *         mask,
 939                            int                      width)
 940 {
 941     const uint32_t *end = src + width;
 942
 943     while (src < end)
 944     {
 945         __m64 a = load8888 (*mask);
 946         __m64 s = load8888 (*src);
 947         __m64 d = load8888 (*dest);
 948         __m64 sa = expand_alpha (s);
 949
 950         a = pix_multiply (a, sa);
 951         d = pix_multiply (d, a);
 952         *dest = store8888 (d);
 953
 954         ++src;
 955         ++dest;
 956         ++mask;
 957     }
 958     _mm_empty ();
 959 }
 960
 961 static void
 962 mmx_combine_out_ca (pixman_implementation_t *imp,
 963                     pixman_op_t              op,
 964                     uint32_t *               dest,
 965                     const uint32_t *         src,
 966                     const uint32_t *         mask,
 967                     int                      width)
 968 {
 969     const uint32_t *end = src + width;
 970
 971     while (src < end)
 972     {
 973         __m64 a = load8888 (*mask);
 974         __m64 s = load8888 (*src);
 975         __m64 d = load8888 (*dest);
 976         __m64 da = expand_alpha (d);
 977
 978         da = negate (da);
 979         s = pix_multiply (s, a);
 980         s = pix_multiply (s, da);
 981         *dest = store8888 (s);
 982
 983         ++src;
 984         ++dest;
 985         ++mask;
 986     }
 987     _mm_empty ();
 988 }
 989
 990 static void
 991 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
 992                             pixman_op_t              op,
 993                             uint32_t *               dest,
 994                             const uint32_t *         src,
 995                             const uint32_t *         mask,
 996                             int                      width)
 997 {
 998     const uint32_t *end = src + width;
 999
1000     while (src < end)
1001     {
1002         __m64 a = load8888 (*mask);
1003         __m64 s = load8888 (*src);
1004         __m64 d = load8888 (*dest);
1005         __m64 sa = expand_alpha (s);
1006
1007         a = pix_multiply (a, sa);
1008         a = negate (a);
1009         d = pix_multiply (d, a);
1010         *dest = store8888 (d);
1011
1012         ++src;
1013         ++dest;
1014         ++mask;
1015     }
1016     _mm_empty ();
1017 }
1018
1019 static void
1020 mmx_combine_atop_ca (pixman_implementation_t *imp,
1021                      pixman_op_t              op,
1022                      uint32_t *               dest,
1023                      const uint32_t *         src,
1024                      const uint32_t *         mask,
1025                      int                      width)
1026 {
1027     const uint32_t *end = src + width;
1028
1029     while (src < end)
1030     {
1031         __m64 a = load8888 (*mask);
1032         __m64 s = load8888 (*src);
1033         __m64 d = load8888 (*dest);
1034         __m64 da = expand_alpha (d);
1035         __m64 sa = expand_alpha (s);
1036
1037         s = pix_multiply (s, a);
1038         a = pix_multiply (a, sa);
1039         a = negate (a);
1040         d = pix_add_mul (d, a, s, da);
1041         *dest = store8888 (d);
1042
1043         ++src;
1044         ++dest;
1045         ++mask;
1046     }
1047     _mm_empty ();
1048 }
1049
1050 static void
1051 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1052                              pixman_op_t              op,
1053                              uint32_t *               dest,
1054                              const uint32_t *         src,
1055                              const uint32_t *         mask,
1056                              int                      width)
1057 {
1058     const uint32_t *end = src + width;
1059
1060     while (src < end)
1061     {
1062         __m64 a = load8888 (*mask);
1063         __m64 s = load8888 (*src);
1064         __m64 d = load8888 (*dest);
1065         __m64 da = expand_alpha (d);
1066         __m64 sa = expand_alpha (s);
1067
1068         s = pix_multiply (s, a);
1069         a = pix_multiply (a, sa);
1070         da = negate (da);
1071         d = pix_add_mul (d, a, s, da);
1072         *dest = store8888 (d);
1073
1074         ++src;
1075         ++dest;
1076         ++mask;
1077     }
1078     _mm_empty ();
1079 }
1080
1081 static void
1082 mmx_combine_xor_ca (pixman_implementation_t *imp,
1083                     pixman_op_t              op,
1084                     uint32_t *               dest,
1085                     const uint32_t *         src,
1086                     const uint32_t *         mask,
1087                     int                      width)
1088 {
1089     const uint32_t *end = src + width;
1090
1091     while (src < end)
1092     {
1093         __m64 a = load8888 (*mask);
1094         __m64 s = load8888 (*src);
1095         __m64 d = load8888 (*dest);
1096         __m64 da = expand_alpha (d);
1097         __m64 sa = expand_alpha (s);
1098
1099         s = pix_multiply (s, a);
1100         a = pix_multiply (a, sa);
1101         da = negate (da);
1102         a = negate (a);
1103         d = pix_add_mul (d, a, s, da);
1104         *dest = store8888 (d);
1105
1106         ++src;
1107         ++dest;
1108         ++mask;
1109     }
1110     _mm_empty ();
1111 }
1112
1113 static void
1114 mmx_combine_add_ca (pixman_implementation_t *imp,
1115                     pixman_op_t              op,
1116                     uint32_t *               dest,
1117                     const uint32_t *         src,
1118                     const uint32_t *         mask,
1119                     int                      width)
1120 {
1121     const uint32_t *end = src + width;
1122
1123     while (src < end)
1124     {
1125         __m64 a = load8888 (*mask);
1126         __m64 s = load8888 (*src);
1127         __m64 d = load8888 (*dest);
1128
1129         s = pix_multiply (s, a);
1130         d = pix_add (s, d);
1131         *dest = store8888 (d);
1132
1133         ++src;
1134         ++dest;
1135         ++mask;
1136     }
1137     _mm_empty ();
1138 }
1139
1140 /* ------------- MMX code paths called from fbpict.c -------------------- */
1141
1142 static void
1143 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1144                            pixman_composite_info_t *info)
1145 {
1146     PIXMAN_COMPOSITE_ARGS (info);
1147     uint32_t src;
1148     uint32_t    *dst_line, *dst;
1149     int32_t w;
1150     int dst_stride;
1151     __m64 vsrc, vsrca;
1152
1153     CHECKPOINT ();
1154
1155     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1156
1157     if (src == 0)
1158         return;
1159
1160     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1161
1162     vsrc = load8888 (src);
1163     vsrca = expand_alpha (vsrc);
1164
1165     while (height--)
1166     {
1167         dst = dst_line;
1168         dst_line += dst_stride;
1169         w = width;
1170
1171         CHECKPOINT ();
1172
1173         while (w && (unsigned long)dst & 7)
1174         {
1175             *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
1176
1177             w--;
1178             dst++;
1179         }
1180
1181         while (w >= 2)
1182         {
1183             __m64 vdest;
1184             __m64 dest0, dest1;
1185
1186             vdest = *(__m64 *)dst;
1187
1188             dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1189             dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1190
1191             *(__m64 *)dst = pack8888 (dest0, dest1);
1192
1193             dst += 2;
1194             w -= 2;
1195         }
1196
1197         CHECKPOINT ();
1198
1199         if (w)
1200         {
1201             *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
1202         }
1203     }
1204
1205     _mm_empty ();
1206 }
1207
1208 static void
1209 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1210                            pixman_composite_info_t *info)
1211 {
1212     PIXMAN_COMPOSITE_ARGS (info);
1213     uint32_t src;
1214     uint16_t    *dst_line, *dst;
1215     int32_t w;
1216     int dst_stride;
1217     __m64 vsrc, vsrca;
1218
1219     CHECKPOINT ();
1220
1221     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1222
1223     if (src == 0)
1224         return;
1225
1226     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1227
1228     vsrc = load8888 (src);
1229     vsrca = expand_alpha (vsrc);
1230
1231     while (height--)
1232     {
1233         dst = dst_line;
1234         dst_line += dst_stride;
1235         w = width;
1236
1237         CHECKPOINT ();
1238
1239         while (w && (unsigned long)dst & 7)
1240         {
1241             uint64_t d = *dst;
1242             __m64 vdest = expand565 (to_m64 (d), 0);
1243
1244             vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1245             *dst = to_uint64 (vdest);
1246
1247             w--;
1248             dst++;
1249         }
1250
1251         while (w >= 4)
1252         {
1253             __m64 vdest;
1254
1255             vdest = *(__m64 *)dst;
1256
1257             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
1258             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
1259             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
1260             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);
1261
1262             *(__m64 *)dst = vdest;
1263
1264             dst += 4;
1265             w -= 4;
1266         }
1267
1268         CHECKPOINT ();
1269
1270         while (w)
1271         {
1272             uint64_t d = *dst;
1273             __m64 vdest = expand565 (to_m64 (d), 0);
1274
1275             vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1276             *dst = to_uint64 (vdest);
1277
1278             w--;
1279             dst++;
1280         }
1281     }
1282
1283     _mm_empty ();
1284 }
1285
1286 static void
1287 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1288                                    pixman_composite_info_t *info)
1289 {
1290     PIXMAN_COMPOSITE_ARGS (info);
1291     uint32_t src;
1292     uint32_t    *dst_line;
1293     uint32_t    *mask_line;
1294     int dst_stride, mask_stride;
1295     __m64 vsrc, vsrca;
1296
1297     CHECKPOINT ();
1298
1299     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1300
1301     if (src == 0)
1302         return;
1303
1304     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1305     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1306
1307     vsrc = load8888 (src);
1308     vsrca = expand_alpha (vsrc);
1309
1310     while (height--)
1311     {
1312         int twidth = width;
1313         uint32_t *p = (uint32_t *)mask_line;
1314         uint32_t *q = (uint32_t *)dst_line;
1315
1316         while (twidth && (unsigned long)q & 7)
1317         {
1318             uint32_t m = *(uint32_t *)p;
1319
1320             if (m)
1321             {
1322                 __m64 vdest = load8888 (*q);
1323                 vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
1324                 *q = store8888 (vdest);
1325             }
1326
1327             twidth--;
1328             p++;
1329             q++;
1330         }
1331
1332         while (twidth >= 2)
1333         {
1334             uint32_t m0, m1;
1335             m0 = *p;
1336             m1 = *(p + 1);
1337
1338             if (m0 | m1)
1339             {
1340                 __m64 dest0, dest1;
1341                 __m64 vdest = *(__m64 *)q;
1342
1343                 dest0 = in_over (vsrc, vsrca, load8888 (m0),
1344                                  expand8888 (vdest, 0));
1345                 dest1 = in_over (vsrc, vsrca, load8888 (m1),
1346                                  expand8888 (vdest, 1));
1347
1348                 *(__m64 *)q = pack8888 (dest0, dest1);
1349             }
1350
1351             p += 2;
1352             q += 2;
1353             twidth -= 2;
1354         }
1355
1356         if (twidth)
1357         {
1358             uint32_t m = *(uint32_t *)p;
1359
1360             if (m)
1361             {
1362                 __m64 vdest = load8888 (*q);
1363                 vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
1364                 *q = store8888 (vdest);
1365             }
1366
1367             twidth--;
1368             p++;
1369             q++;
1370         }
1371
1372         dst_line += dst_stride;
1373         mask_line += mask_stride;
1374     }
1375
1376     _mm_empty ();
1377 }
1378
1379 static void
1380 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1381                                 pixman_composite_info_t *info)
1382 {
1383     PIXMAN_COMPOSITE_ARGS (info);
1384     uint32_t    *dst_line, *dst;
1385     uint32_t    *src_line, *src;
1386     uint32_t mask;
1387     __m64 vmask;
1388     int dst_stride, src_stride;
1389     int32_t w;
1390
1391     CHECKPOINT ();
1392
1393     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1394     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1395
1396     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1397     mask &= 0xff000000;
1398     mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1399     vmask = load8888 (mask);
1400
1401     while (height--)
1402     {
1403         dst = dst_line;
1404         dst_line += dst_stride;
1405         src = src_line;
1406         src_line += src_stride;
1407         w = width;
1408
1409         while (w && (unsigned long)dst & 7)
1410         {
1411             __m64 s = load8888 (*src);
1412             __m64 d = load8888 (*dst);
1413
1414             *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
1415
1416             w--;
1417             dst++;
1418             src++;
1419         }
1420
1421         while (w >= 2)
1422         {
1423             __m64 vs = ldq_u((uint64_t *)src);
1424             __m64 vd = *(__m64 *)dst;
1425             __m64 vsrc0 = expand8888 (vs, 0);
1426             __m64 vsrc1 = expand8888 (vs, 1);
1427
1428             *(__m64 *)dst = pack8888 (
1429                 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1430                 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1431
1432             w -= 2;
1433             dst += 2;
1434             src += 2;
1435         }
1436
1437         if (w)
1438         {
1439             __m64 s = load8888 (*src);
1440             __m64 d = load8888 (*dst);
1441
1442             *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
1443         }
1444     }
1445
1446     _mm_empty ();
1447 }
1448
1449 static void
1450 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1451                                 pixman_composite_info_t *info)
1452 {
1453     PIXMAN_COMPOSITE_ARGS (info);
1454     uint32_t *dst_line, *dst;
1455     uint32_t *src_line, *src;
1456     uint32_t mask;
1457     __m64 vmask;
1458     int dst_stride, src_stride;
1459     int32_t w;
1460     __m64 srca;
1461
1462     CHECKPOINT ();
1463
1464     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1465     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1466     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1467
1468     mask &= 0xff000000;
1469     mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1470     vmask = load8888 (mask);
1471     srca = MC (4x00ff);
1472
1473     while (height--)
1474     {
1475         dst = dst_line;
1476         dst_line += dst_stride;
1477         src = src_line;
1478         src_line += src_stride;
1479         w = width;
1480
1481         while (w && (unsigned long)dst & 7)
1482         {
1483             __m64 s = load8888 (*src | 0xff000000);
1484             __m64 d = load8888 (*dst);
1485
1486             *dst = store8888 (in_over (s, srca, vmask, d));
1487
1488             w--;
1489             dst++;
1490             src++;
1491         }
1492
1493         while (w >= 16)
1494         {
1495             __m64 vd0 = *(__m64 *)(dst + 0);
1496             __m64 vd1 = *(__m64 *)(dst + 2);
1497             __m64 vd2 = *(__m64 *)(dst + 4);
1498             __m64 vd3 = *(__m64 *)(dst + 6);
1499             __m64 vd4 = *(__m64 *)(dst + 8);
1500             __m64 vd5 = *(__m64 *)(dst + 10);
1501             __m64 vd6 = *(__m64 *)(dst + 12);
1502             __m64 vd7 = *(__m64 *)(dst + 14);
1503
1504             __m64 vs0 = ldq_u((uint64_t *)(src + 0));
1505             __m64 vs1 = ldq_u((uint64_t *)(src + 2));
1506             __m64 vs2 = ldq_u((uint64_t *)(src + 4));
1507             __m64 vs3 = ldq_u((uint64_t *)(src + 6));
1508             __m64 vs4 = ldq_u((uint64_t *)(src + 8));
1509             __m64 vs5 = ldq_u((uint64_t *)(src + 10));
1510             __m64 vs6 = ldq_u((uint64_t *)(src + 12));
1511             __m64 vs7 = ldq_u((uint64_t *)(src + 14));
1512
1513             vd0 = pack8888 (
1514                 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1515                 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1516
1517             vd1 = pack8888 (
1518                 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1519                 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1520
1521             vd2 = pack8888 (
1522                 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1523                 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1524
1525             vd3 = pack8888 (
1526                 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1527                 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1528
1529             vd4 = pack8888 (
1530                 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1531                 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1532
1533             vd5 = pack8888 (
1534                 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1535                 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1536
1537             vd6 = pack8888 (
1538                 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1539                 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1540
1541             vd7 = pack8888 (
1542                 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1543                 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1544
1545             *(__m64 *)(dst + 0) = vd0;
1546             *(__m64 *)(dst + 2) = vd1;
1547             *(__m64 *)(dst + 4) = vd2;
1548             *(__m64 *)(dst + 6) = vd3;
1549             *(__m64 *)(dst + 8) = vd4;
1550             *(__m64 *)(dst + 10) = vd5;
1551             *(__m64 *)(dst + 12) = vd6;
1552             *(__m64 *)(dst + 14) = vd7;
1553
1554             w -= 16;
1555             dst += 16;
1556             src += 16;
1557         }
1558
1559         while (w)
1560         {
1561             __m64 s = load8888 (*src | 0xff000000);
1562             __m64 d = load8888 (*dst);
1563
1564             *dst = store8888 (in_over (s, srca, vmask, d));
1565
1566             w--;
1567             dst++;
1568             src++;
1569         }
1570     }
1571
1572     _mm_empty ();
1573 }
1574
1575 static void
1576 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1577                               pixman_composite_info_t *info)
1578 {
1579     PIXMAN_COMPOSITE_ARGS (info);
1580     uint32_t *dst_line, *dst;
1581     uint32_t *src_line, *src;
1582     uint32_t s;
1583     int dst_stride, src_stride;
1584     uint8_t a;
1585     int32_t w;
1586
1587     CHECKPOINT ();
1588
1589     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1590     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1591
1592     while (height--)
1593     {
1594         dst = dst_line;
1595         dst_line += dst_stride;
1596         src = src_line;
1597         src_line += src_stride;
1598         w = width;
1599
1600         while (w--)
1601         {
1602             s = *src++;
1603             a = s >> 24;
1604
1605             if (a == 0xff)
1606             {
1607                 *dst = s;
1608             }
1609             else if (s)
1610             {
1611                 __m64 ms, sa;
1612                 ms = load8888 (s);
1613                 sa = expand_alpha (ms);
1614                 *dst = store8888 (over (ms, sa, load8888 (*dst)));
1615             }
1616
1617             dst++;
1618         }
1619     }
1620     _mm_empty ();
1621 }
1622
1623 static void
1624 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1625                               pixman_composite_info_t *info)
1626 {
1627     PIXMAN_COMPOSITE_ARGS (info);
1628     uint16_t    *dst_line, *dst;
1629     uint32_t    *src_line, *src;
1630     int dst_stride, src_stride;
1631     int32_t w;
1632
1633     CHECKPOINT ();
1634
1635     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1636     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1637
1638 #if 0
1639     /* FIXME */
1640     assert (src_image->drawable == mask_image->drawable);
1641 #endif
1642
1643     while (height--)
1644     {
1645         dst = dst_line;
1646         dst_line += dst_stride;
1647         src = src_line;
1648         src_line += src_stride;
1649         w = width;
1650
1651         CHECKPOINT ();
1652
1653         while (w && (unsigned long)dst & 7)
1654         {
1655             __m64 vsrc = load8888 (*src);
1656             uint64_t d = *dst;
1657             __m64 vdest = expand565 (to_m64 (d), 0);
1658
1659             vdest = pack_565 (
1660                 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1661
1662             *dst = to_uint64 (vdest);
1663
1664             w--;
1665             dst++;
1666             src++;
1667         }
1668
1669         CHECKPOINT ();
1670
1671         while (w >= 4)
1672         {
1673             __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1674             __m64 vdest;
1675
1676             vsrc0 = load8888 (*(src + 0));
1677             vsrc1 = load8888 (*(src + 1));
1678             vsrc2 = load8888 (*(src + 2));
1679             vsrc3 = load8888 (*(src + 3));
1680
1681             vdest = *(__m64 *)dst;
1682
1683             vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
1684             vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
1685             vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
1686             vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);
1687
1688             *(__m64 *)dst = vdest;
1689
1690             w -= 4;
1691             dst += 4;
1692             src += 4;
1693         }
1694
1695         CHECKPOINT ();
1696
1697         while (w)
1698         {
1699             __m64 vsrc = load8888 (*src);
1700             uint64_t d = *dst;
1701             __m64 vdest = expand565 (to_m64 (d), 0);
1702
1703             vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1704
1705             *dst = to_uint64 (vdest);
1706
1707             w--;
1708             dst++;
1709             src++;
1710         }
1711     }
1712
1713     _mm_empty ();
1714 }
1715
1716 static void
1717 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1718                              pixman_composite_info_t *info)
1719 {
1720     PIXMAN_COMPOSITE_ARGS (info);
1721     uint32_t src, srca;
1722     uint32_t *dst_line, *dst;
1723     uint8_t *mask_line, *mask;
1724     int dst_stride, mask_stride;
1725     int32_t w;
1726     __m64 vsrc, vsrca;
1727     uint64_t srcsrc;
1728
1729     CHECKPOINT ();
1730
1731     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1732
1733     srca = src >> 24;
1734     if (src == 0)
1735         return;
1736
1737     srcsrc = (uint64_t)src << 32 | src;
1738
1739     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1740     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1741
1742     vsrc = load8888 (src);
1743     vsrca = expand_alpha (vsrc);
1744
1745     while (height--)
1746     {
1747         dst = dst_line;
1748         dst_line += dst_stride;
1749         mask = mask_line;
1750         mask_line += mask_stride;
1751         w = width;
1752
1753         CHECKPOINT ();
1754
1755         while (w && (unsigned long)dst & 7)
1756         {
1757             uint64_t m = *mask;
1758
1759             if (m)
1760             {
1761                 __m64 vdest = in_over (vsrc, vsrca,
1762                                        expand_alpha_rev (to_m64 (m)),
1763                                        load8888 (*dst));
1764
1765                 *dst = store8888 (vdest);
1766             }
1767
1768             w--;
1769             mask++;
1770             dst++;
1771         }
1772
1773         CHECKPOINT ();
1774
1775         while (w >= 2)
1776         {
1777             uint64_t m0, m1;
1778
1779             m0 = *mask;
1780             m1 = *(mask + 1);
1781
1782             if (srca == 0xff && (m0 & m1) == 0xff)
1783             {
1784                 *(uint64_t *)dst = srcsrc;
1785             }
1786             else if (m0 | m1)
1787             {
1788                 __m64 vdest;
1789                 __m64 dest0, dest1;
1790
1791                 vdest = *(__m64 *)dst;
1792
1793                 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
1794                                  expand8888 (vdest, 0));
1795                 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
1796                                  expand8888 (vdest, 1));
1797
1798                 *(__m64 *)dst = pack8888 (dest0, dest1);
1799             }
1800
1801             mask += 2;
1802             dst += 2;
1803             w -= 2;
1804         }
1805
1806         CHECKPOINT ();
1807
1808         if (w)
1809         {
1810             uint64_t m = *mask;
1811
1812             if (m)
1813             {
1814                 __m64 vdest = load8888 (*dst);
1815
1816                 vdest = in_over (
1817                     vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
1818                 *dst = store8888 (vdest);
1819             }
1820         }
1821     }
1822
1823     _mm_empty ();
1824 }
1825
1826 pixman_bool_t
1827 pixman_fill_mmx (uint32_t *bits,
1828                  int       stride,
1829                  int       bpp,
1830                  int       x,
1831                  int       y,
1832                  int       width,
1833                  int       height,
1834                  uint32_t xor)
1835 {
1836     uint64_t fill;
1837     __m64 vfill;
1838     uint32_t byte_width;
1839     uint8_t     *byte_line;
1840
1841 #if defined __GNUC__ && defined USE_X86_MMX
1842     __m64 v1, v2, v3, v4, v5, v6, v7;
1843 #endif
1844
1845     if (bpp != 16 && bpp != 32 && bpp != 8)
1846         return FALSE;
1847
1848     if (bpp == 8)
1849     {
1850         stride = stride * (int) sizeof (uint32_t) / 1;
1851         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
1852         byte_width = width;
1853         stride *= 1;
1854         xor = (xor & 0xff) * 0x01010101;
1855     }
1856     else if (bpp == 16)
1857     {
1858         stride = stride * (int) sizeof (uint32_t) / 2;
1859         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
1860         byte_width = 2 * width;
1861         stride *= 2;
1862         xor = (xor & 0xffff) * 0x00010001;
1863     }
1864     else
1865     {
1866         stride = stride * (int) sizeof (uint32_t) / 4;
1867         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
1868         byte_width = 4 * width;
1869         stride *= 4;
1870     }
1871
1872     fill = ((uint64_t)xor << 32) | xor;
1873     vfill = to_m64 (fill);
1874
1875 #if defined __GNUC__ && defined USE_X86_MMX
1876     __asm__ (
1877         "movq           %7,     %0\n"
1878         "movq           %7,     %1\n"
1879         "movq           %7,     %2\n"
1880         "movq           %7,     %3\n"
1881         "movq           %7,     %4\n"
1882         "movq           %7,     %5\n"
1883         "movq           %7,     %6\n"
1884         : "=&y" (v1), "=&y" (v2), "=&y" (v3),
1885           "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
1886         : "y" (vfill));
1887 #endif
1888
1889     while (height--)
1890     {
1891         int w;
1892         uint8_t *d = byte_line;
1893
1894         byte_line += stride;
1895         w = byte_width;
1896
1897         if (w >= 1 && ((unsigned long)d & 1))
1898         {
1899             *(uint8_t *)d = (xor & 0xff);
1900             w--;
1901             d++;
1902         }
1903
1904         if (w >= 2 && ((unsigned long)d & 3))
1905         {
1906             *(uint16_t *)d = xor;
1907             w -= 2;
1908             d += 2;
1909         }
1910
1911         while (w >= 4 && ((unsigned long)d & 7))
1912         {
1913             *(uint32_t *)d = xor;
1914
1915             w -= 4;
1916             d += 4;
1917         }
1918
1919         while (w >= 64)
1920         {
1921 #if defined __GNUC__ && defined USE_X86_MMX
1922             __asm__ (
1923                 "movq   %1,       (%0)\n"
1924                 "movq   %2,      8(%0)\n"
1925                 "movq   %3,     16(%0)\n"
1926                 "movq   %4,     24(%0)\n"
1927                 "movq   %5,     32(%0)\n"
1928                 "movq   %6,     40(%0)\n"
1929                 "movq   %7,     48(%0)\n"
1930                 "movq   %8,     56(%0)\n"
1931                 :
1932                 : "r" (d),
1933                   "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
1934                   "y" (v4), "y" (v5), "y" (v6), "y" (v7)
1935                 : "memory");
1936 #else
1937             *(__m64*) (d +  0) = vfill;
1938             *(__m64*) (d +  8) = vfill;
1939             *(__m64*) (d + 16) = vfill;
1940             *(__m64*) (d + 24) = vfill;
1941             *(__m64*) (d + 32) = vfill;
1942             *(__m64*) (d + 40) = vfill;
1943             *(__m64*) (d + 48) = vfill;
1944             *(__m64*) (d + 56) = vfill;
1945 #endif
1946             w -= 64;
1947             d += 64;
1948         }
1949
1950         while (w >= 4)
1951         {
1952             *(uint32_t *)d = xor;
1953
1954             w -= 4;
1955             d += 4;
1956         }
1957         if (w >= 2)
1958         {
1959             *(uint16_t *)d = xor;
1960             w -= 2;
1961             d += 2;
1962         }
1963         if (w >= 1)
1964         {
1965             *(uint8_t *)d = (xor & 0xff);
1966             w--;
1967             d++;
1968         }
1969
1970     }
1971
1972     _mm_empty ();
1973     return TRUE;
1974 }
1975
1976 static void
1977 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
1978                             pixman_composite_info_t *info)
1979 {
1980     PIXMAN_COMPOSITE_ARGS (info);
1981     uint32_t src, srca;
1982     uint32_t    *dst_line, *dst;
1983     uint8_t     *mask_line, *mask;
1984     int dst_stride, mask_stride;
1985     int32_t w;
1986     __m64 vsrc;
1987     uint64_t srcsrc;
1988
1989     CHECKPOINT ();
1990
1991     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1992
1993     srca = src >> 24;
1994     if (src == 0)
1995     {
1996         pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride,
1997                          PIXMAN_FORMAT_BPP (dest_image->bits.format),
1998                          dest_x, dest_y, width, height, 0);
1999         return;
2000     }
2001
2002     srcsrc = (uint64_t)src << 32 | src;
2003
2004     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2005     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2006
2007     vsrc = load8888 (src);
2008
2009     while (height--)
2010     {
2011         dst = dst_line;
2012         dst_line += dst_stride;
2013         mask = mask_line;
2014         mask_line += mask_stride;
2015         w = width;
2016
2017         CHECKPOINT ();
2018
2019         while (w && (unsigned long)dst & 7)
2020         {
2021             uint64_t m = *mask;
2022
2023             if (m)
2024             {
2025                 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2026
2027                 *dst = store8888 (vdest);
2028             }
2029             else
2030             {
2031                 *dst = 0;
2032             }
2033
2034             w--;
2035             mask++;
2036             dst++;
2037         }
2038
2039         CHECKPOINT ();
2040
2041         while (w >= 2)
2042         {
2043             uint64_t m0, m1;
2044             m0 = *mask;
2045             m1 = *(mask + 1);
2046
2047             if (srca == 0xff && (m0 & m1) == 0xff)
2048             {
2049                 *(uint64_t *)dst = srcsrc;
2050             }
2051             else if (m0 | m1)
2052             {
2053                 __m64 dest0, dest1;
2054
2055                 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2056                 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2057
2058                 *(__m64 *)dst = pack8888 (dest0, dest1);
2059             }
2060             else
2061             {
2062                 *(uint64_t *)dst = 0;
2063             }
2064
2065             mask += 2;
2066             dst += 2;
2067             w -= 2;
2068         }
2069
2070         CHECKPOINT ();
2071
2072         if (w)
2073         {
2074             uint64_t m = *mask;
2075
2076             if (m)
2077             {
2078                 __m64 vdest = load8888 (*dst);
2079
2080                 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2081                 *dst = store8888 (vdest);
2082             }
2083             else
2084             {
2085                 *dst = 0;
2086             }
2087         }
2088     }
2089
2090     _mm_empty ();
2091 }
2092
2093 static void
2094 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2095                              pixman_composite_info_t *info)
2096 {
2097     PIXMAN_COMPOSITE_ARGS (info);
2098     uint32_t src, srca;
2099     uint16_t *dst_line, *dst;
2100     uint8_t *mask_line, *mask;
2101     int dst_stride, mask_stride;
2102     int32_t w;
2103     __m64 vsrc, vsrca, tmp;
2104     uint64_t srcsrcsrcsrc, src16;
2105
2106     CHECKPOINT ();
2107
2108     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2109
2110     srca = src >> 24;
2111     if (src == 0)
2112         return;
2113
2114     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2115     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2116
2117     vsrc = load8888 (src);
2118     vsrca = expand_alpha (vsrc);
2119
2120     tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2121     src16 = to_uint64 (tmp);
2122
2123     srcsrcsrcsrc =
2124         (uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
2125         (uint64_t)src16 << 16 | (uint64_t)src16;
2126
2127     while (height--)
2128     {
2129         dst = dst_line;
2130         dst_line += dst_stride;
2131         mask = mask_line;
2132         mask_line += mask_stride;
2133         w = width;
2134
2135         CHECKPOINT ();
2136
2137         while (w && (unsigned long)dst & 7)
2138         {
2139             uint64_t m = *mask;
2140
2141             if (m)
2142             {
2143                 uint64_t d = *dst;
2144                 __m64 vd = to_m64 (d);
2145                 __m64 vdest = in_over (
2146                     vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2147
2148                 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2149                 *dst = to_uint64 (vd);
2150             }
2151
2152             w--;
2153             mask++;
2154             dst++;
2155         }
2156
2157         CHECKPOINT ();
2158
2159         while (w >= 4)
2160         {
2161             uint64_t m0, m1, m2, m3;
2162             m0 = *mask;
2163             m1 = *(mask + 1);
2164             m2 = *(mask + 2);
2165             m3 = *(mask + 3);
2166
2167             if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2168             {
2169                 *(uint64_t *)dst = srcsrcsrcsrc;
2170             }
2171             else if (m0 | m1 | m2 | m3)
2172             {
2173                 __m64 vdest;
2174                 __m64 vm0, vm1, vm2, vm3;
2175
2176                 vdest = *(__m64 *)dst;
2177
2178                 vm0 = to_m64 (m0);
2179                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
2180                                            expand565 (vdest, 0)), vdest, 0);
2181                 vm1 = to_m64 (m1);
2182                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
2183                                            expand565 (vdest, 1)), vdest, 1);
2184                 vm2 = to_m64 (m2);
2185                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
2186                                            expand565 (vdest, 2)), vdest, 2);
2187                 vm3 = to_m64 (m3);
2188                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
2189                                            expand565 (vdest, 3)), vdest, 3);
2190
2191                 *(__m64 *)dst = vdest;
2192             }
2193
2194             w -= 4;
2195             mask += 4;
2196             dst += 4;
2197         }
2198
2199         CHECKPOINT ();
2200
2201         while (w)
2202         {
2203             uint64_t m = *mask;
2204
2205             if (m)
2206             {
2207                 uint64_t d = *dst;
2208                 __m64 vd = to_m64 (d);
2209                 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2210                                        expand565 (vd, 0));
2211                 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2212                 *dst = to_uint64 (vd);
2213             }
2214
2215             w--;
2216             mask++;
2217             dst++;
2218         }
2219     }
2220
2221     _mm_empty ();
2222 }
2223
2224 static void
2225 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2226                                 pixman_composite_info_t *info)
2227 {
2228     PIXMAN_COMPOSITE_ARGS (info);
2229     uint16_t    *dst_line, *dst;
2230     uint32_t    *src_line, *src;
2231     int dst_stride, src_stride;
2232     int32_t w;
2233
2234     CHECKPOINT ();
2235
2236     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2237     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2238
2239 #if 0
2240     /* FIXME */
2241     assert (src_image->drawable == mask_image->drawable);
2242 #endif
2243
2244     while (height--)
2245     {
2246         dst = dst_line;
2247         dst_line += dst_stride;
2248         src = src_line;
2249         src_line += src_stride;
2250         w = width;
2251
2252         CHECKPOINT ();
2253
2254         while (w && (unsigned long)dst & 7)
2255         {
2256             __m64 vsrc = load8888 (*src);
2257             uint64_t d = *dst;
2258             __m64 vdest = expand565 (to_m64 (d), 0);
2259
2260             vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2261
2262             *dst = to_uint64 (vdest);
2263
2264             w--;
2265             dst++;
2266             src++;
2267         }
2268
2269         CHECKPOINT ();
2270
2271         while (w >= 4)
2272         {
2273             uint32_t s0, s1, s2, s3;
2274             unsigned char a0, a1, a2, a3;
2275
2276             s0 = *src;
2277             s1 = *(src + 1);
2278             s2 = *(src + 2);
2279             s3 = *(src + 3);
2280
2281             a0 = (s0 >> 24);
2282             a1 = (s1 >> 24);
2283             a2 = (s2 >> 24);
2284             a3 = (s3 >> 24);
2285
2286             if ((a0 & a1 & a2 & a3) == 0xFF)
2287             {
2288                 __m64 vdest;
2289                 vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0);
2290                 vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1);
2291                 vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2);
2292                 vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3);
2293
2294                 *(__m64 *)dst = vdest;
2295             }
2296             else if (s0 | s1 | s2 | s3)
2297             {
2298                 __m64 vdest = *(__m64 *)dst;
2299
2300                 vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0);
2301                 vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1);
2302                 vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2);
2303                 vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3);
2304
2305                 *(__m64 *)dst = vdest;
2306             }
2307
2308             w -= 4;
2309             dst += 4;
2310             src += 4;
2311         }
2312
2313         CHECKPOINT ();
2314
2315         while (w)
2316         {
2317             __m64 vsrc = load8888 (*src);
2318             uint64_t d = *dst;
2319             __m64 vdest = expand565 (to_m64 (d), 0);
2320
2321             vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2322
2323             *dst = to_uint64 (vdest);
2324
2325             w--;
2326             dst++;
2327             src++;
2328         }
2329     }
2330
2331     _mm_empty ();
2332 }
2333
2334 static void
2335 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2336                                 pixman_composite_info_t *info)
2337 {
2338     PIXMAN_COMPOSITE_ARGS (info);
2339     uint32_t    *dst_line, *dst;
2340     uint32_t    *src_line, *src;
2341     int dst_stride, src_stride;
2342     int32_t w;
2343
2344     CHECKPOINT ();
2345
2346     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2347     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2348
2349 #if 0
2350     /* FIXME */
2351     assert (src_image->drawable == mask_image->drawable);
2352 #endif
2353
2354     while (height--)
2355     {
2356         dst = dst_line;
2357         dst_line += dst_stride;
2358         src = src_line;
2359         src_line += src_stride;
2360         w = width;
2361
2362         while (w && (unsigned long)dst & 7)
2363         {
2364             __m64 s = load8888 (*src);
2365             __m64 d = load8888 (*dst);
2366
2367             *dst = store8888 (over_rev_non_pre (s, d));
2368
2369             w--;
2370             dst++;
2371             src++;
2372         }
2373
2374         while (w >= 2)
2375         {
2376             uint64_t s0, s1;
2377             unsigned char a0, a1;
2378             __m64 d0, d1;
2379
2380             s0 = *src;
2381             s1 = *(src + 1);
2382
2383             a0 = (s0 >> 24);
2384             a1 = (s1 >> 24);
2385
2386             if ((a0 & a1) == 0xFF)
2387             {
2388                 d0 = invert_colors (load8888 (s0));
2389                 d1 = invert_colors (load8888 (s1));
2390
2391                 *(__m64 *)dst = pack8888 (d0, d1);
2392             }
2393             else if (s0 | s1)
2394             {
2395                 __m64 vdest = *(__m64 *)dst;
2396
2397                 d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0));
2398                 d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1));
2399
2400                 *(__m64 *)dst = pack8888 (d0, d1);
2401             }
2402
2403             w -= 2;
2404             dst += 2;
2405             src += 2;
2406         }
2407
2408         if (w)
2409         {
2410             __m64 s = load8888 (*src);
2411             __m64 d = load8888 (*dst);
2412
2413             *dst = store8888 (over_rev_non_pre (s, d));
2414         }
2415     }
2416
2417     _mm_empty ();
2418 }
2419
2420 static void
2421 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2422                                    pixman_composite_info_t *info)
2423 {
2424     PIXMAN_COMPOSITE_ARGS (info);
2425     uint32_t src;
2426     uint16_t    *dst_line;
2427     uint32_t    *mask_line;
2428     int dst_stride, mask_stride;
2429     __m64 vsrc, vsrca;
2430
2431     CHECKPOINT ();
2432
2433     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2434
2435     if (src == 0)
2436         return;
2437
2438     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2439     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2440
2441     vsrc = load8888 (src);
2442     vsrca = expand_alpha (vsrc);
2443
2444     while (height--)
2445     {
2446         int twidth = width;
2447         uint32_t *p = (uint32_t *)mask_line;
2448         uint16_t *q = (uint16_t *)dst_line;
2449
2450         while (twidth && ((unsigned long)q & 7))
2451         {
2452             uint32_t m = *(uint32_t *)p;
2453
2454             if (m)
2455             {
2456                 uint64_t d = *q;
2457                 __m64 vdest = expand565 (to_m64 (d), 0);
2458                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
2459                 *q = to_uint64 (vdest);
2460             }
2461
2462             twidth--;
2463             p++;
2464             q++;
2465         }
2466
2467         while (twidth >= 4)
2468         {
2469             uint32_t m0, m1, m2, m3;
2470
2471             m0 = *p;
2472             m1 = *(p + 1);
2473             m2 = *(p + 2);
2474             m3 = *(p + 3);
2475
2476             if ((m0 | m1 | m2 | m3))
2477             {
2478                 __m64 vdest = *(__m64 *)q;
2479
2480                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0);
2481                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1);
2482                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2);
2483                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3);
2484
2485                 *(__m64 *)q = vdest;
2486             }
2487             twidth -= 4;
2488             p += 4;
2489             q += 4;
2490         }
2491
2492         while (twidth)
2493         {
2494             uint32_t m;
2495
2496             m = *(uint32_t *)p;
2497             if (m)
2498             {
2499                 uint64_t d = *q;
2500                 __m64 vdest = expand565 (to_m64 (d), 0);
2501                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
2502                 *q = to_uint64 (vdest);
2503             }
2504
2505             twidth--;
2506             p++;
2507             q++;
2508         }
2509
2510         mask_line += mask_stride;
2511         dst_line += dst_stride;
2512     }
2513
2514     _mm_empty ();
2515 }
2516
2517 static void
2518 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2519                         pixman_composite_info_t *info)
2520 {
2521     PIXMAN_COMPOSITE_ARGS (info);
2522     uint8_t *dst_line, *dst;
2523     uint8_t *mask_line, *mask;
2524     int dst_stride, mask_stride;
2525     int32_t w;
2526     uint32_t src;
2527     uint8_t sa;
2528     __m64 vsrc, vsrca;
2529
2530     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2531     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2532
2533     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2534
2535     sa = src >> 24;
2536
2537     vsrc = load8888 (src);
2538     vsrca = expand_alpha (vsrc);
2539
2540     while (height--)
2541     {
2542         dst = dst_line;
2543         dst_line += dst_stride;
2544         mask = mask_line;
2545         mask_line += mask_stride;
2546         w = width;
2547
2548         while (w && (unsigned long)dst & 7)
2549         {
2550             uint16_t tmp;
2551             uint8_t a;
2552             uint32_t m, d;
2553
2554             a = *mask++;
2555             d = *dst;
2556
2557             m = MUL_UN8 (sa, a, tmp);
2558             d = MUL_UN8 (m, d, tmp);
2559
2560             *dst++ = d;
2561             w--;
2562         }
2563
2564         while (w >= 4)
2565         {
2566             __m64 vmask;
2567             __m64 vdest;
2568
2569             vmask = load8888 (ldl_u((uint32_t *)mask));
2570             vdest = load8888 (*(uint32_t *)dst);
2571
2572             *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));
2573
2574             dst += 4;
2575             mask += 4;
2576             w -= 4;
2577         }
2578
2579         while (w--)
2580         {
2581             uint16_t tmp;
2582             uint8_t a;
2583             uint32_t m, d;
2584
2585             a = *mask++;
2586             d = *dst;
2587
2588             m = MUL_UN8 (sa, a, tmp);
2589             d = MUL_UN8 (m, d, tmp);
2590
2591             *dst++ = d;
2592         }
2593     }
2594
2595     _mm_empty ();
2596 }
2597
2598 static void
2599 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2600                       pixman_composite_info_t *info)
2601 {
2602     PIXMAN_COMPOSITE_ARGS (info);
2603     uint8_t     *dst_line, *dst;
2604     uint8_t     *src_line, *src;
2605     int src_stride, dst_stride;
2606     int32_t w;
2607
2608     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2609     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2610
2611     while (height--)
2612     {
2613         dst = dst_line;
2614         dst_line += dst_stride;
2615         src = src_line;
2616         src_line += src_stride;
2617         w = width;
2618
2619         while (w && (unsigned long)dst & 3)
2620         {
2621             uint8_t s, d;
2622             uint16_t tmp;
2623
2624             s = *src;
2625             d = *dst;
2626
2627             *dst = MUL_UN8 (s, d, tmp);
2628
2629             src++;
2630             dst++;
2631             w--;
2632         }
2633
2634         while (w >= 4)
2635         {
2636             uint32_t *s = (uint32_t *)src;
2637             uint32_t *d = (uint32_t *)dst;
2638
2639             *d = store8888 (in (load8888 (ldl_u((uint32_t *)s)), load8888 (*d)));
2640
2641             w -= 4;
2642             dst += 4;
2643             src += 4;
2644         }
2645
2646         while (w--)
2647         {
2648             uint8_t s, d;
2649             uint16_t tmp;
2650
2651             s = *src;
2652             d = *dst;
2653
2654             *dst = MUL_UN8 (s, d, tmp);
2655
2656             src++;
2657             dst++;
2658         }
2659     }
2660
2661     _mm_empty ();
2662 }
2663
2664 static void
2665 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2666                          pixman_composite_info_t *info)
2667 {
2668     PIXMAN_COMPOSITE_ARGS (info);
2669     uint8_t     *dst_line, *dst;
2670     uint8_t     *mask_line, *mask;
2671     int dst_stride, mask_stride;
2672     int32_t w;
2673     uint32_t src;
2674     uint8_t sa;
2675     __m64 vsrc, vsrca;
2676
2677     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2678     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2679
2680     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2681
2682     sa = src >> 24;
2683
2684     if (src == 0)
2685         return;
2686
2687     vsrc = load8888 (src);
2688     vsrca = expand_alpha (vsrc);
2689
2690     while (height--)
2691     {
2692         dst = dst_line;
2693         dst_line += dst_stride;
2694         mask = mask_line;
2695         mask_line += mask_stride;
2696         w = width;
2697
2698         while (w && (unsigned long)dst & 3)
2699         {
2700             uint16_t tmp;
2701             uint16_t a;
2702             uint32_t m, d;
2703             uint32_t r;
2704
2705             a = *mask++;
2706             d = *dst;
2707
2708             m = MUL_UN8 (sa, a, tmp);
2709             r = ADD_UN8 (m, d, tmp);
2710
2711             *dst++ = r;
2712             w--;
2713         }
2714
2715         while (w >= 4)
2716         {
2717             __m64 vmask;
2718             __m64 vdest;
2719
2720             vmask = load8888 (ldl_u((uint32_t *)mask));
2721             vdest = load8888 (*(uint32_t *)dst);
2722
2723             *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
2724
2725             dst += 4;
2726             mask += 4;
2727             w -= 4;
2728         }
2729
2730         while (w--)
2731         {
2732             uint16_t tmp;
2733             uint16_t a;
2734             uint32_t m, d;
2735             uint32_t r;
2736
2737             a = *mask++;
2738             d = *dst;
2739
2740             m = MUL_UN8 (sa, a, tmp);
2741             r = ADD_UN8 (m, d, tmp);
2742
2743             *dst++ = r;
2744         }
2745     }
2746
2747     _mm_empty ();
2748 }
2749
2750 static void
2751 mmx_composite_add_8_8 (pixman_implementation_t *imp,
2752                        pixman_composite_info_t *info)
2753 {
2754     PIXMAN_COMPOSITE_ARGS (info);
2755     uint8_t *dst_line, *dst;
2756     uint8_t *src_line, *src;
2757     int dst_stride, src_stride;
2758     int32_t w;
2759     uint8_t s, d;
2760     uint16_t t;
2761
2762     CHECKPOINT ();
2763
2764     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2765     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2766
2767     while (height--)
2768     {
2769         dst = dst_line;
2770         dst_line += dst_stride;
2771         src = src_line;
2772         src_line += src_stride;
2773         w = width;
2774
2775         while (w && (unsigned long)dst & 7)
2776         {
2777             s = *src;
2778             d = *dst;
2779             t = d + s;
2780             s = t | (0 - (t >> 8));
2781             *dst = s;
2782
2783             dst++;
2784             src++;
2785             w--;
2786         }
2787
2788         while (w >= 8)
2789         {
2790             *(__m64*)dst = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
2791             dst += 8;
2792             src += 8;
2793             w -= 8;
2794         }
2795
2796         while (w)
2797         {
2798             s = *src;
2799             d = *dst;
2800             t = d + s;
2801             s = t | (0 - (t >> 8));
2802             *dst = s;
2803
2804             dst++;
2805             src++;
2806             w--;
2807         }
2808     }
2809
2810     _mm_empty ();
2811 }
2812
2813 static void
2814 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
2815                              pixman_composite_info_t *info)
2816 {
2817     PIXMAN_COMPOSITE_ARGS (info);
2818     __m64 dst64;
2819     uint32_t    *dst_line, *dst;
2820     uint32_t    *src_line, *src;
2821     int dst_stride, src_stride;
2822     int32_t w;
2823
2824     CHECKPOINT ();
2825
2826     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2827     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2828
2829     while (height--)
2830     {
2831         dst = dst_line;
2832         dst_line += dst_stride;
2833         src = src_line;
2834         src_line += src_stride;
2835         w = width;
2836
2837         while (w && (unsigned long)dst & 7)
2838         {
2839             *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
2840                                                    _mm_cvtsi32_si64 (*dst)));
2841             dst++;
2842             src++;
2843             w--;
2844         }
2845
2846         while (w >= 2)
2847         {
2848             dst64 = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
2849             *(uint64_t*)dst = to_uint64 (dst64);
2850             dst += 2;
2851             src += 2;
2852             w -= 2;
2853         }
2854
2855         if (w)
2856         {
2857             *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
2858                                                    _mm_cvtsi32_si64 (*dst)));
2859
2860         }
2861     }
2862
2863     _mm_empty ();
2864 }
2865
2866 static pixman_bool_t
2867 pixman_blt_mmx (uint32_t *src_bits,
2868                 uint32_t *dst_bits,
2869                 int       src_stride,
2870                 int       dst_stride,
2871                 int       src_bpp,
2872                 int       dst_bpp,
2873                 int       src_x,
2874                 int       src_y,
2875                 int       dest_x,
2876                 int       dest_y,
2877                 int       width,
2878                 int       height)
2879 {
2880     uint8_t *   src_bytes;
2881     uint8_t *   dst_bytes;
2882     int byte_width;
2883
2884     if (src_bpp != dst_bpp)
2885         return FALSE;
2886
2887     if (src_bpp == 16)
2888     {
2889         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
2890         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
2891         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
2892         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
2893         byte_width = 2 * width;
2894         src_stride *= 2;
2895         dst_stride *= 2;
2896     }
2897     else if (src_bpp == 32)
2898     {
2899         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
2900         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
2901         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
2902         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
2903         byte_width = 4 * width;
2904         src_stride *= 4;
2905         dst_stride *= 4;
2906     }
2907     else
2908     {
2909         return FALSE;
2910     }
2911
2912     while (height--)
2913     {
2914         int w;
2915         uint8_t *s = src_bytes;
2916         uint8_t *d = dst_bytes;
2917         src_bytes += src_stride;
2918         dst_bytes += dst_stride;
2919         w = byte_width;
2920
2921         if (w >= 1 && ((unsigned long)d & 1))
2922         {
2923             *(uint8_t *)d = *(uint8_t *)s;
2924             w -= 1;
2925             s += 1;
2926             d += 1;
2927         }
2928
2929         if (w >= 2 && ((unsigned long)d & 3))
2930         {
2931             *(uint16_t *)d = *(uint16_t *)s;
2932             w -= 2;
2933             s += 2;
2934             d += 2;
2935         }
2936
2937         while (w >= 4 && ((unsigned long)d & 7))
2938         {
2939             *(uint32_t *)d = ldl_u((uint32_t *)s);
2940
2941             w -= 4;
2942             s += 4;
2943             d += 4;
2944         }
2945
2946         while (w >= 64)
2947         {
2948 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
2949             __asm__ (
2950                 "movq     (%1),   %%mm0\n"
2951                 "movq    8(%1),   %%mm1\n"
2952                 "movq   16(%1),   %%mm2\n"
2953                 "movq   24(%1),   %%mm3\n"
2954                 "movq   32(%1),   %%mm4\n"
2955                 "movq   40(%1),   %%mm5\n"
2956                 "movq   48(%1),   %%mm6\n"
2957                 "movq   56(%1),   %%mm7\n"
2958
2959                 "movq   %%mm0,    (%0)\n"
2960                 "movq   %%mm1,   8(%0)\n"
2961                 "movq   %%mm2,  16(%0)\n"
2962                 "movq   %%mm3,  24(%0)\n"
2963                 "movq   %%mm4,  32(%0)\n"
2964                 "movq   %%mm5,  40(%0)\n"
2965                 "movq   %%mm6,  48(%0)\n"
2966                 "movq   %%mm7,  56(%0)\n"
2967                 :
2968                 : "r" (d), "r" (s)
2969                 : "memory",
2970                   "%mm0", "%mm1", "%mm2", "%mm3",
2971                   "%mm4", "%mm5", "%mm6", "%mm7");
2972 #else
2973             __m64 v0 = ldq_u((uint64_t *)(s + 0));
2974             __m64 v1 = ldq_u((uint64_t *)(s + 8));
2975             __m64 v2 = ldq_u((uint64_t *)(s + 16));
2976             __m64 v3 = ldq_u((uint64_t *)(s + 24));
2977             __m64 v4 = ldq_u((uint64_t *)(s + 32));
2978             __m64 v5 = ldq_u((uint64_t *)(s + 40));
2979             __m64 v6 = ldq_u((uint64_t *)(s + 48));
2980             __m64 v7 = ldq_u((uint64_t *)(s + 56));
2981             *(__m64 *)(d + 0)  = v0;
2982             *(__m64 *)(d + 8)  = v1;
2983             *(__m64 *)(d + 16) = v2;
2984             *(__m64 *)(d + 24) = v3;
2985             *(__m64 *)(d + 32) = v4;
2986             *(__m64 *)(d + 40) = v5;
2987             *(__m64 *)(d + 48) = v6;
2988             *(__m64 *)(d + 56) = v7;
2989 #endif
2990
2991             w -= 64;
2992             s += 64;
2993             d += 64;
2994         }
2995         while (w >= 4)
2996         {
2997             *(uint32_t *)d = ldl_u((uint32_t *)s);
2998
2999             w -= 4;
3000             s += 4;
3001             d += 4;
3002         }
3003         if (w >= 2)
3004         {
3005             *(uint16_t *)d = *(uint16_t *)s;
3006             w -= 2;
3007             s += 2;
3008             d += 2;
3009         }
3010     }
3011
3012     _mm_empty ();
3013
3014     return TRUE;
3015 }
3016
3017 static void
3018 mmx_composite_copy_area (pixman_implementation_t *imp,
3019                          pixman_composite_info_t *info)
3020 {
3021     PIXMAN_COMPOSITE_ARGS (info);
3022
3023     pixman_blt_mmx (src_image->bits.bits,
3024                     dest_image->bits.bits,
3025                     src_image->bits.rowstride,
3026                     dest_image->bits.rowstride,
3027                     PIXMAN_FORMAT_BPP (src_image->bits.format),
3028                     PIXMAN_FORMAT_BPP (dest_image->bits.format),
3029                     src_x, src_y, dest_x, dest_y, width, height);
3030 }
3031
#ifdef USE_ARM_IWMMXT
/*
 * OVER fast path: 32-bit source whose top byte is ignored (it is forced to
 * 0xff), 8-bit mask, 32-bit destination.  Only built for USE_ARM_IWMMXT.
 *
 * imp:  implementation handle (not used by this function).
 * info: composite request; PIXMAN_COMPOSITE_ARGS () provides the image,
 *       coordinate, width and height locals used below.
 *
 * Per pixel: a zero mask byte leaves the destination alone, a 0xff mask
 * byte stores the source pixel directly, and any other value goes through
 * in_over () with the current destination pixel.
 */
static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t  *src, *src_line;
    uint32_t  *dst, *dst_line;
    uint8_t  *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    for (; height--; src_line += src_stride,
                     dst_line += dst_stride,
                     mask_line += mask_stride)
    {
        src = src_line;
        dst = dst_line;
        mask = mask_line;

        for (w = width; w--; mask++, dst++, src++)
        {
            uint64_t coverage = *mask;
            __m64 pixel;

            /* Nothing to draw for this pixel. */
            if (!coverage)
                continue;

            pixel = load8888 (*src | 0xff000000);

            if (coverage != 0xff)
            {
                /* Partial coverage: blend with the destination. */
                __m64 pixel_alpha = expand_alpha (pixel);
                __m64 vcoverage = expand_alpha_rev (to_m64 (coverage));

                pixel = in_over (pixel, pixel_alpha, vcoverage, load8888 (*dst));
            }

            *dst = store8888 (pixel);
        }
    }

    _mm_empty ();
}
#endif
3090
3091 static const pixman_fast_path_t mmx_fast_paths[] =
3092 {
3093     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
3094     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
3095     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
3096     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
3097     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
3098     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
3099     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3100     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3101     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
3102     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3103     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3104     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
3105     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3106     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3107     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
3108     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3109     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3110     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
3111     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
3112     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
3113     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
3114     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
3115     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
3116     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
3117     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
3118     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
3119 #ifdef USE_ARM_IWMMXT
    /* FIXME: These fast paths are only enabled for iwMMXt builds, since
     * they are apparently not actually faster than the generic code on x86.
     */
3123     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
3124     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
3125     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
3126     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
3127 #endif
3128     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
3129     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
3130     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
3131     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3132     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3133
3134     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
3135     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
3136     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
3137     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
3138     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
3139     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
3140
3141     PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
3142     PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
3143     PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
3144     PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
3145
3146     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
3147     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
3148     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
3149     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
3150     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
3151     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
3152     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3153     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3154     PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3155     PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3156     PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
3157     PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
3158
3159     PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
3160     PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
3161
3162     { PIXMAN_OP_NONE },
3163 };
3164
3165 static pixman_bool_t
3166 mmx_blt (pixman_implementation_t *imp,
3167          uint32_t *               src_bits,
3168          uint32_t *               dst_bits,
3169          int                      src_stride,
3170          int                      dst_stride,
3171          int                      src_bpp,
3172          int                      dst_bpp,
3173          int                      src_x,
3174          int                      src_y,
3175          int                      dest_x,
3176          int                      dest_y,
3177          int                      width,
3178          int                      height)
3179 {
3180     if (!pixman_blt_mmx (
3181             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3182             src_x, src_y, dest_x, dest_y, width, height))
3183
3184     {
3185         return _pixman_implementation_blt (
3186             imp->delegate,
3187             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3188             src_x, src_y, dest_x, dest_y, width, height);
3189     }
3190
3191     return TRUE;
3192 }
3193
3194 static pixman_bool_t
3195 mmx_fill (pixman_implementation_t *imp,
3196           uint32_t *               bits,
3197           int                      stride,
3198           int                      bpp,
3199           int                      x,
3200           int                      y,
3201           int                      width,
3202           int                      height,
3203           uint32_t xor)
3204 {
3205     if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
3206     {
3207         return _pixman_implementation_fill (
3208             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
3209     }
3210
3211     return TRUE;
3212 }
3213
3214 pixman_implementation_t *
3215 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
3216 {
3217     pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
3218
3219     imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
3220     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
3221     imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
3222     imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
3223     imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
3224     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
3225     imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
3226     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
3227     imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
3228     imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
3229     imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
3230
3231     imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
3232     imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
3233     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
3234     imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
3235     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
3236     imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
3237     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
3238     imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
3239     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
3240     imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
3241     imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
3242
3243     imp->blt = mmx_blt;
3244     imp->fill = mmx_fill;
3245
3246     return imp;
3247 }
3248
3249 #endif /* USE_X86_MMX || USE_ARM_IWMMXT */