pixman/pixman-mmx.c

   1 /*
   2  * Copyright © 2004, 2005 Red Hat, Inc.
   3  * Copyright © 2004 Nicholas Miell
   4  * Copyright © 2005 Trolltech AS
   5  *
   6  * Permission to use, copy, modify, distribute, and sell this software and its
   7  * documentation for any purpose is hereby granted without fee, provided that
   8  * the above copyright notice appear in all copies and that both that
   9  * copyright notice and this permission notice appear in supporting
  10  * documentation, and that the name of Red Hat not be used in advertising or
  11  * publicity pertaining to distribution of the software without specific,
  12  * written prior permission.  Red Hat makes no representations about the
  13  * suitability of this software for any purpose.  It is provided "as is"
  14  * without express or implied warranty.
  15  *
  16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
  17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
  19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
  21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  23  * SOFTWARE.
  24  *
  25  * Author:  Søren Sandmann (sandmann@redhat.com)
  26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
  27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
  28  *
  29  * Based on work by Owen Taylor
  30  */
  31
  32 #ifdef HAVE_CONFIG_H
  33 #include <config.h>
  34 #endif
  35
  36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
  37
  38 #ifdef USE_LOONGSON_MMI
  39 #include <loongson-mmintrin.h>
  40 #else
  41 #include <mmintrin.h>
  42 #endif
  43 #include "pixman-private.h"
  44 #include "pixman-combine32.h"
  45
/* Tracing switch.  The macro defined here is spelled no_vERBOSE, not
 * VERBOSE, so unless VERBOSE is defined somewhere else the #ifdef below
 * takes the #else branch and CHECKPOINT() expands to nothing.
 * NOTE(review): presumably renaming this to VERBOSE is the intended way
 * to turn the checkpoints on -- confirm before relying on it.
 */
#define no_vERBOSE

#ifdef VERBOSE
/* Report the current function name and line number through error_f. */
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif
  53
  54 #ifdef USE_ARM_IWMMXT
  55 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
  56 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  57 _mm_empty (void)
  58 {
  59
  60 }
  61 #endif
  62
  63 #ifdef USE_X86_MMX
  64 # if (defined(__SUNPRO_C) || defined(_MSC_VER))
  65 #  include <xmmintrin.h>
  66 # else
  67 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
  68  * instructions to be generated that we don't want. Just duplicate the
  69  * functions we want to use.  */
  70 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  71 _mm_movemask_pi8 (__m64 __A)
  72 {
  73     int ret;
  74
  75     asm ("pmovmskb %1, %0\n\t"
  76         : "=r" (ret)
  77         : "y" (__A)
  78     );
  79
  80     return ret;
  81 }
  82
  83 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  84 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
  85 {
  86     asm ("pmulhuw %1, %0\n\t"
  87         : "+y" (__A)
  88         : "y" (__B)
  89     );
  90     return __A;
  91 }
  92
  93 #  ifdef __OPTIMIZE__
  94 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  95 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
  96 {
  97     __m64 ret;
  98
  99     asm ("pshufw %2, %1, %0\n\t"
 100         : "=y" (ret)
 101         : "y" (__A), "K" (__N)
 102     );
 103
 104     return ret;
 105 }
 106 #  else
 107 #   define _mm_shuffle_pi16(A, N) \
 108     ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))
 109 #  endif
 110 # endif
 111 #endif
 112
 113 #ifndef _MSC_VER
 114 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 115  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
 116 #endif
 117
/* Notes about writing mmx code
 *
 * Give memory operands as the second operand. If you give one as the
 * first, gcc will first load it into a register, then use that
 * register.
 *
 *   i.e. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies, i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */
 136
 137 /* --------------- MMX primitives ------------------------------------- */
 138
/* Select how this file converts between uint64_t and __m64.  Exactly one
 * of the following is defined, depending on the compiler/target:
 *
 * If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
#ifdef _MSC_VER
# define M64_MEMBER m64_u64
#elif defined(__ICC)
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

/* Storage type of the constants in mmx_data_t: a plain uint64_t whenever
 * a cast, cvt intrinsic or double reinterpretation is used to obtain the
 * __m64 value, and __m64 itself in the remaining (struct/union) case.
 */
#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif
 175
/* Table of 64-bit constants used by the helpers in this file; they are
 * read through the MC () macro.  The field order must stay in sync with
 * the initializer of `c`, because the MSVC flavour of MMXDATA_INIT
 * initializes positionally.
 */
typedef struct
{
    mmxdatafield mmx_4x00ff;                    /* used by negate () */
    mmxdatafield mmx_4x0080;                    /* bias added in pix_multiply () */
    mmxdatafield mmx_565_rgb;                   /* channel mask in expand565 () */
    mmxdatafield mmx_565_unpack_multiplier;     /* multiplier in expand565 () */
    mmxdatafield mmx_565_pack_multiplier;       /* multiplier in pack_4xpacked565 () */
    mmxdatafield mmx_565_r;                     /* channel masks in pack_565 () */
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;             /* channel masks in pack_4xpacked565 () */
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;              /* channel masks in expand_4xpacked565 () */
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    /* mask_N clears 16-bit word N; used by the non-Loongson pack_565 (). */
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;                /* 0xff in the low byte of word 3 */
    mmxdatafield mmx_4x0101;                    /* multiplier in pix_multiply () */
    mmxdatafield mmx_ff000000;                  /* 0xff000000 in both 32-bit halves */
} mmx_data_t;
 201
/* MMXDATA_INIT (field, val) produces one initializer of the table below.
 * MSVC gets a positional, brace-wrapped UI64 literal (the field name is
 * ignored); the other compilers get a designated initializer, wrapped in
 * braces when __m64 is a struct.
 */
#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field =   { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field =   val ## ULL
#endif

/* The constant table itself.  Entries are listed in the declaration
 * order of mmx_data_t, which the positional MSVC initializer relies on.
 */
static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
};
 235
/* MC (name) yields the constant c.mmx_<name> as an __m64 value, using the
 * conversion that matches how __m64 is represented on this compiler:
 * to_m64 (), a direct cast, a pointer reinterpretation, or no conversion
 * at all when the table already stores __m64.
 */
#ifdef USE_CVT_INTRINSICS
#    define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
#    define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
#    define MC(x) c.mmx_ ## x
#endif
 245
 246 static force_inline __m64
 247 to_m64 (uint64_t x)
 248 {
 249 #ifdef USE_CVT_INTRINSICS
 250     return _mm_cvtsi64_m64 (x);
 251 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
 252     __m64 res;
 253
 254     res.M64_MEMBER = x;
 255     return res;
 256 #elif defined USE_M64_DOUBLE
 257     return *(__m64 *)&x;
 258 #else /* USE_M64_CASTS */
 259     return (__m64)x;
 260 #endif
 261 }
 262
 263 static force_inline uint64_t
 264 to_uint64 (__m64 x)
 265 {
 266 #ifdef USE_CVT_INTRINSICS
 267     return _mm_cvtm64_si64 (x);
 268 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
 269     uint64_t res = x.M64_MEMBER;
 270     return res;
 271 #elif defined USE_M64_DOUBLE
 272     return *(uint64_t *)&x;
 273 #else /* USE_M64_CASTS */
 274     return (uint64_t)x;
 275 #endif
 276 }
 277
 278 static force_inline __m64
 279 shift (__m64 v,
 280        int   s)
 281 {
 282     if (s > 0)
 283         return _mm_slli_si64 (v, s);
 284     else if (s < 0)
 285         return _mm_srli_si64 (v, -s);
 286     else
 287         return v;
 288 }
 289
 290 static force_inline __m64
 291 negate (__m64 mask)
 292 {
 293     return _mm_xor_si64 (mask, MC (4x00ff));
 294 }
 295
 296 static force_inline __m64
 297 pix_multiply (__m64 a, __m64 b)
 298 {
 299     __m64 res;
 300
 301     res = _mm_mullo_pi16 (a, b);
 302     res = _mm_adds_pu16 (res, MC (4x0080));
 303     res = _mm_mulhi_pu16 (res, MC (4x0101));
 304
 305     return res;
 306 }
 307
 308 static force_inline __m64
 309 pix_add (__m64 a, __m64 b)
 310 {
 311     return _mm_adds_pu8 (a, b);
 312 }
 313
 314 static force_inline __m64
 315 expand_alpha (__m64 pixel)
 316 {
 317     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
 318 }
 319
 320 static force_inline __m64
 321 expand_alpha_rev (__m64 pixel)
 322 {
 323     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
 324 }
 325
 326 static force_inline __m64
 327 invert_colors (__m64 pixel)
 328 {
 329     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
 330 }
 331
 332 static force_inline __m64
 333 over (__m64 src,
 334       __m64 srca,
 335       __m64 dest)
 336 {
 337     return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
 338 }
 339
 340 static force_inline __m64
 341 over_rev_non_pre (__m64 src, __m64 dest)
 342 {
 343     __m64 srca = expand_alpha (src);
 344     __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
 345
 346     return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
 347 }
 348
 349 static force_inline __m64
 350 in (__m64 src, __m64 mask)
 351 {
 352     return pix_multiply (src, mask);
 353 }
 354
 355 #ifndef _MSC_VER
 356 static force_inline __m64
 357 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
 358 {
 359     return over (in (src, mask), pix_multiply (srca, mask), dest);
 360 }
 361
 362 #else
 363
 364 #define in_over(src, srca, mask, dest)                                  \
 365     over (in (src, mask), pix_multiply (srca, mask), dest)
 366
 367 #endif
 368
 369 /* Elemental unaligned loads */
 370
 371 static force_inline __m64 ldq_u(__m64 *p)
 372 {
 373 #ifdef USE_X86_MMX
 374     /* x86's alignment restrictions are very relaxed. */
 375     return *(__m64 *)p;
 376 #elif defined USE_ARM_IWMMXT
 377     int align = (uintptr_t)p & 7;
 378     __m64 *aligned_p;
 379     if (align == 0)
 380         return *p;
 381     aligned_p = (__m64 *)((uintptr_t)p & ~7);
 382     return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
 383 #else
 384     struct __una_u64 { __m64 x __attribute__((packed)); };
 385     const struct __una_u64 *ptr = (const struct __una_u64 *) p;
 386     return (__m64) ptr->x;
 387 #endif
 388 }
 389
 390 static force_inline uint32_t ldl_u(const uint32_t *p)
 391 {
 392 #ifdef USE_X86_MMX
 393     /* x86's alignment restrictions are very relaxed. */
 394     return *p;
 395 #else
 396     struct __una_u32 { uint32_t x __attribute__((packed)); };
 397     const struct __una_u32 *ptr = (const struct __una_u32 *) p;
 398     return ptr->x;
 399 #endif
 400 }
 401
 402 static force_inline __m64
 403 load (const uint32_t *v)
 404 {
 405 #ifdef USE_LOONGSON_MMI
 406     __m64 ret;
 407     asm ("lwc1 %0, %1\n\t"
 408         : "=f" (ret)
 409         : "m" (*v)
 410     );
 411     return ret;
 412 #else
 413     return _mm_cvtsi32_si64 (*v);
 414 #endif
 415 }
 416
 417 static force_inline __m64
 418 load8888 (const uint32_t *v)
 419 {
 420 #ifdef USE_LOONGSON_MMI
 421     return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
 422 #else
 423     return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
 424 #endif
 425 }
 426
 427 static force_inline __m64
 428 load8888u (const uint32_t *v)
 429 {
 430     uint32_t l = ldl_u (v);
 431     return load8888 (&l);
 432 }
 433
 434 static force_inline __m64
 435 pack8888 (__m64 lo, __m64 hi)
 436 {
 437     return _mm_packs_pu16 (lo, hi);
 438 }
 439
 440 static force_inline void
 441 store (uint32_t *dest, __m64 v)
 442 {
 443 #ifdef USE_LOONGSON_MMI
 444     asm ("swc1 %1, %0\n\t"
 445         : "=m" (*dest)
 446         : "f" (v)
 447         : "memory"
 448     );
 449 #else
 450     *dest = _mm_cvtsi64_si32 (v);
 451 #endif
 452 }
 453
 454 static force_inline void
 455 store8888 (uint32_t *dest, __m64 v)
 456 {
 457     v = pack8888 (v, _mm_setzero_si64 ());
 458     store (dest, v);
 459 }
 460
 461 static force_inline pixman_bool_t
 462 is_equal (__m64 a, __m64 b)
 463 {
 464 #ifdef USE_LOONGSON_MMI
 465     /* __m64 is double, we can compare directly. */
 466     return a == b;
 467 #else
 468     return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
 469 #endif
 470 }
 471
 472 static force_inline pixman_bool_t
 473 is_opaque (__m64 v)
 474 {
 475 #ifdef USE_LOONGSON_MMI
 476     return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
 477 #else
 478     __m64 ffs = _mm_cmpeq_pi8 (v, v);
 479     return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
 480 #endif
 481 }
 482
 483 static force_inline pixman_bool_t
 484 is_zero (__m64 v)
 485 {
 486     return is_equal (v, _mm_setzero_si64 ());
 487 }
 488
 489 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 490  *
 491  *    00RR00GG00BB
 492  *
 493  * --- Expanding 565 in the low word ---
 494  *
 495  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 496  * m = m & (01f0003f001f);
 497  * m = m * (008404100840);
 498  * m = m >> 8;
 499  *
 500  * Note the trick here - the top word is shifted by another nibble to
 501  * avoid it bumping into the middle word
 502  */
 503 static force_inline __m64
 504 expand565 (__m64 pixel, int pos)
 505 {
 506     __m64 p = pixel;
 507     __m64 t1, t2;
 508
 509     /* move pixel to low 16 bit and zero the rest */
 510 #ifdef USE_LOONGSON_MMI
 511     p = loongson_extract_pi16 (p, pos);
 512 #else
 513     p = shift (shift (p, (3 - pos) * 16), -48);
 514 #endif
 515
 516     t1 = shift (p, 36 - 11);
 517     t2 = shift (p, 16 - 5);
 518
 519     p = _mm_or_si64 (t1, p);
 520     p = _mm_or_si64 (t2, p);
 521     p = _mm_and_si64 (p, MC (565_rgb));
 522
 523     pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
 524     return _mm_srli_pi16 (pixel, 8);
 525 }
 526
 527 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 528  *
 529  *    AARRGGBBRRGGBB
 530  */
 531 static force_inline void
 532 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
 533 {
 534     __m64 t0, t1, alpha = _mm_setzero_si64 ();;
 535     __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
 536     __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
 537     __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
 538     if (full_alpha)
 539         alpha = _mm_cmpeq_pi32 (alpha, alpha);
 540
 541     /* Replicate high bits into empty low bits. */
 542     r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
 543     g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
 544     b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
 545
 546     r = _mm_packs_pu16 (r, _mm_setzero_si64 ());        /* 00 00 00 00 R3 R2 R1 R0 */
 547     g = _mm_packs_pu16 (g, _mm_setzero_si64 ());        /* 00 00 00 00 G3 G2 G1 G0 */
 548     b = _mm_packs_pu16 (b, _mm_setzero_si64 ());        /* 00 00 00 00 B3 B2 B1 B0 */
 549
 550     t1 = _mm_unpacklo_pi8 (r, alpha);                   /* A3 R3 A2 R2 A1 R1 A0 R0 */
 551     t0 = _mm_unpacklo_pi8 (b, g);                       /* G3 B3 G2 B2 G1 B1 G0 B0 */
 552
 553     *vout0 = _mm_unpacklo_pi16 (t0, t1);                /* A1 R1 G1 B1 A0 R0 G0 B0 */
 554     *vout1 = _mm_unpackhi_pi16 (t0, t1);                /* A3 R3 G3 B3 A2 R2 G2 B2 */
 555 }
 556
 557 static force_inline __m64
 558 expand8888 (__m64 in, int pos)
 559 {
 560     if (pos == 0)
 561         return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
 562     else
 563         return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
 564 }
 565
 566 static force_inline __m64
 567 expandx888 (__m64 in, int pos)
 568 {
 569     return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
 570 }
 571
 572 static force_inline void
 573 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
 574 {
 575     __m64 v0, v1;
 576     expand_4xpacked565 (vin, &v0, &v1, full_alpha);
 577     *vout0 = expand8888 (v0, 0);
 578     *vout1 = expand8888 (v0, 1);
 579     *vout2 = expand8888 (v1, 0);
 580     *vout3 = expand8888 (v1, 1);
 581 }
 582
 583 static force_inline __m64
 584 pack_565 (__m64 pixel, __m64 target, int pos)
 585 {
 586     __m64 p = pixel;
 587     __m64 t = target;
 588     __m64 r, g, b;
 589
 590     r = _mm_and_si64 (p, MC (565_r));
 591     g = _mm_and_si64 (p, MC (565_g));
 592     b = _mm_and_si64 (p, MC (565_b));
 593
 594 #ifdef USE_LOONGSON_MMI
 595     r = shift (r, -(32 - 8));
 596     g = shift (g, -(16 - 3));
 597     b = shift (b, -(0  + 3));
 598
 599     p = _mm_or_si64 (r, g);
 600     p = _mm_or_si64 (p, b);
 601     return loongson_insert_pi16 (t, p, pos);
 602 #else
 603     r = shift (r, -(32 - 8) + pos * 16);
 604     g = shift (g, -(16 - 3) + pos * 16);
 605     b = shift (b, -(0  + 3) + pos * 16);
 606
 607     if (pos == 0)
 608         t = _mm_and_si64 (t, MC (mask_0));
 609     else if (pos == 1)
 610         t = _mm_and_si64 (t, MC (mask_1));
 611     else if (pos == 2)
 612         t = _mm_and_si64 (t, MC (mask_2));
 613     else if (pos == 3)
 614         t = _mm_and_si64 (t, MC (mask_3));
 615
 616     p = _mm_or_si64 (r, t);
 617     p = _mm_or_si64 (g, p);
 618
 619     return _mm_or_si64 (b, p);
 620 #endif
 621 }
 622
 623 static force_inline __m64
 624 pack_4xpacked565 (__m64 a, __m64 b)
 625 {
 626     __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
 627     __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
 628
 629     __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
 630     __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
 631
 632     __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
 633     __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
 634
 635     t0 = _mm_or_si64 (t0, g0);
 636     t1 = _mm_or_si64 (t1, g1);
 637
 638     t0 = shift(t0, -5);
 639 #ifdef USE_ARM_IWMMXT
 640     t1 = shift(t1, -5);
 641     return _mm_packs_pu32 (t0, t1);
 642 #else
 643     t1 = shift(t1, -5 + 16);
 644     return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
 645 #endif
 646 }
 647
 648 #ifndef _MSC_VER
 649
 650 static force_inline __m64
 651 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
 652 {
 653     return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
 654 }
 655
 656 static force_inline __m64
 657 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
 658 {
 659     x = pix_multiply (x, a);
 660     y = pix_multiply (y, b);
 661
 662     return pix_add (x, y);
 663 }
 664
 665 #else
 666
/* MSVC only handles a "pass by register" of up to three SSE intrinsics */

/* Macro form of pack_4x565 () for MSVC: same expression as the inline
 * function used by the other compilers.  */
#define pack_4x565(v0, v1, v2, v3) \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))

/* Macro form of pix_add_mul () for MSVC.  Unlike the inline function it
 * assigns to its x and y arguments, so the caller's variables are
 * overwritten with the intermediate products.  */
#define pix_add_mul(x, a, y, b)  \
    ( x = pix_multiply (x, a),   \
      y = pix_multiply (y, b),   \
      pix_add (x, y) )
 676
 677 #endif
 678
/* --------------- MMX code path for fbcompose.c ---------------------- */
 680
 681 static force_inline __m64
 682 combine (const uint32_t *src, const uint32_t *mask)
 683 {
 684     __m64 vsrc = load8888 (src);
 685
 686     if (mask)
 687     {
 688         __m64 m = load8888 (mask);
 689
 690         m = expand_alpha (m);
 691         vsrc = pix_multiply (vsrc, m);
 692     }
 693
 694     return vsrc;
 695 }
 696
 697 static void
 698 mmx_combine_over_u (pixman_implementation_t *imp,
 699                     pixman_op_t              op,
 700                     uint32_t *               dest,
 701                     const uint32_t *         src,
 702                     const uint32_t *         mask,
 703                     int                      width)
 704 {
 705     const uint32_t *end = dest + width;
 706
 707     while (dest < end)
 708     {
 709         __m64 vsrc = combine (src, mask);
 710
 711         if (is_opaque (vsrc))
 712         {
 713             store8888 (dest, vsrc);
 714         }
 715         else if (!is_zero (vsrc))
 716         {
 717             __m64 sa = expand_alpha (vsrc);
 718             store8888 (dest, over (vsrc, sa, load8888 (dest)));
 719         }
 720
 721         ++dest;
 722         ++src;
 723         if (mask)
 724             ++mask;
 725     }
 726     _mm_empty ();
 727 }
 728
 729 static void
 730 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
 731                             pixman_op_t              op,
 732                             uint32_t *               dest,
 733                             const uint32_t *         src,
 734                             const uint32_t *         mask,
 735                             int                      width)
 736 {
 737     const uint32_t *end = dest + width;
 738
 739     while (dest < end)
 740     {
 741         __m64 d, da;
 742         __m64 s = combine (src, mask);
 743
 744         d = load8888 (dest);
 745         da = expand_alpha (d);
 746         store8888 (dest, over (d, da, s));
 747
 748         ++dest;
 749         ++src;
 750         if (mask)
 751             mask++;
 752     }
 753     _mm_empty ();
 754 }
 755
 756 static void
 757 mmx_combine_in_u (pixman_implementation_t *imp,
 758                   pixman_op_t              op,
 759                   uint32_t *               dest,
 760                   const uint32_t *         src,
 761                   const uint32_t *         mask,
 762                   int                      width)
 763 {
 764     const uint32_t *end = dest + width;
 765
 766     while (dest < end)
 767     {
 768         __m64 a;
 769         __m64 x = combine (src, mask);
 770
 771         a = load8888 (dest);
 772         a = expand_alpha (a);
 773         x = pix_multiply (x, a);
 774
 775         store8888 (dest, x);
 776
 777         ++dest;
 778         ++src;
 779         if (mask)
 780             mask++;
 781     }
 782     _mm_empty ();
 783 }
 784
 785 static void
 786 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
 787                           pixman_op_t              op,
 788                           uint32_t *               dest,
 789                           const uint32_t *         src,
 790                           const uint32_t *         mask,
 791                           int                      width)
 792 {
 793     const uint32_t *end = dest + width;
 794
 795     while (dest < end)
 796     {
 797         __m64 a = combine (src, mask);
 798         __m64 x;
 799
 800         x = load8888 (dest);
 801         a = expand_alpha (a);
 802         x = pix_multiply (x, a);
 803         store8888 (dest, x);
 804
 805         ++dest;
 806         ++src;
 807         if (mask)
 808             mask++;
 809     }
 810     _mm_empty ();
 811 }
 812
 813 static void
 814 mmx_combine_out_u (pixman_implementation_t *imp,
 815                    pixman_op_t              op,
 816                    uint32_t *               dest,
 817                    const uint32_t *         src,
 818                    const uint32_t *         mask,
 819                    int                      width)
 820 {
 821     const uint32_t *end = dest + width;
 822
 823     while (dest < end)
 824     {
 825         __m64 a;
 826         __m64 x = combine (src, mask);
 827
 828         a = load8888 (dest);
 829         a = expand_alpha (a);
 830         a = negate (a);
 831         x = pix_multiply (x, a);
 832         store8888 (dest, x);
 833
 834         ++dest;
 835         ++src;
 836         if (mask)
 837             mask++;
 838     }
 839     _mm_empty ();
 840 }
 841
 842 static void
 843 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
 844                            pixman_op_t              op,
 845                            uint32_t *               dest,
 846                            const uint32_t *         src,
 847                            const uint32_t *         mask,
 848                            int                      width)
 849 {
 850     const uint32_t *end = dest + width;
 851
 852     while (dest < end)
 853     {
 854         __m64 a = combine (src, mask);
 855         __m64 x;
 856
 857         x = load8888 (dest);
 858         a = expand_alpha (a);
 859         a = negate (a);
 860         x = pix_multiply (x, a);
 861
 862         store8888 (dest, x);
 863
 864         ++dest;
 865         ++src;
 866         if (mask)
 867             mask++;
 868     }
 869     _mm_empty ();
 870 }
 871
 872 static void
 873 mmx_combine_atop_u (pixman_implementation_t *imp,
 874                     pixman_op_t              op,
 875                     uint32_t *               dest,
 876                     const uint32_t *         src,
 877                     const uint32_t *         mask,
 878                     int                      width)
 879 {
 880     const uint32_t *end = dest + width;
 881
 882     while (dest < end)
 883     {
 884         __m64 da, d, sia;
 885         __m64 s = combine (src, mask);
 886
 887         d = load8888 (dest);
 888         sia = expand_alpha (s);
 889         sia = negate (sia);
 890         da = expand_alpha (d);
 891         s = pix_add_mul (s, da, d, sia);
 892         store8888 (dest, s);
 893
 894         ++dest;
 895         ++src;
 896         if (mask)
 897             mask++;
 898     }
 899     _mm_empty ();
 900 }
 901
 902 static void
 903 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
 904                             pixman_op_t              op,
 905                             uint32_t *               dest,
 906                             const uint32_t *         src,
 907                             const uint32_t *         mask,
 908                             int                      width)
 909 {
 910     const uint32_t *end;
 911
 912     end = dest + width;
 913
 914     while (dest < end)
 915     {
 916         __m64 dia, d, sa;
 917         __m64 s = combine (src, mask);
 918
 919         d = load8888 (dest);
 920         sa = expand_alpha (s);
 921         dia = expand_alpha (d);
 922         dia = negate (dia);
 923         s = pix_add_mul (s, dia, d, sa);
 924         store8888 (dest, s);
 925
 926         ++dest;
 927         ++src;
 928         if (mask)
 929             mask++;
 930     }
 931     _mm_empty ();
 932 }
 933
 934 static void
 935 mmx_combine_xor_u (pixman_implementation_t *imp,
 936                    pixman_op_t              op,
 937                    uint32_t *               dest,
 938                    const uint32_t *         src,
 939                    const uint32_t *         mask,
 940                    int                      width)
 941 {
 942     const uint32_t *end = dest + width;
 943
 944     while (dest < end)
 945     {
 946         __m64 dia, d, sia;
 947         __m64 s = combine (src, mask);
 948
 949         d = load8888 (dest);
 950         sia = expand_alpha (s);
 951         dia = expand_alpha (d);
 952         sia = negate (sia);
 953         dia = negate (dia);
 954         s = pix_add_mul (s, dia, d, sia);
 955         store8888 (dest, s);
 956
 957         ++dest;
 958         ++src;
 959         if (mask)
 960             mask++;
 961     }
 962     _mm_empty ();
 963 }
 964
 965 static void
 966 mmx_combine_add_u (pixman_implementation_t *imp,
 967                    pixman_op_t              op,
 968                    uint32_t *               dest,
 969                    const uint32_t *         src,
 970                    const uint32_t *         mask,
 971                    int                      width)
 972 {
 973     const uint32_t *end = dest + width;
 974
 975     while (dest < end)
 976     {
 977         __m64 d;
 978         __m64 s = combine (src, mask);
 979
 980         d = load8888 (dest);
 981         s = pix_add (s, d);
 982         store8888 (dest, s);
 983
 984         ++dest;
 985         ++src;
 986         if (mask)
 987             mask++;
 988     }
 989     _mm_empty ();
 990 }
 991
 992 static void
 993 mmx_combine_saturate_u (pixman_implementation_t *imp,
 994                         pixman_op_t              op,
 995                         uint32_t *               dest,
 996                         const uint32_t *         src,
 997                         const uint32_t *         mask,
 998                         int                      width)
 999 {
1000     const uint32_t *end = dest + width;
1001
1002     while (dest < end)
1003     {
1004         uint32_t s, sa, da;
1005         uint32_t d = *dest;
1006         __m64 ms = combine (src, mask);
1007         __m64 md = load8888 (dest);
1008
1009         store8888(&s, ms);
1010         da = ~d >> 24;
1011         sa = s >> 24;
1012
1013         if (sa > da)
1014         {
1015             uint32_t quot = DIV_UN8 (da, sa) << 24;
1016             __m64 msa = load8888 (&quot);
1017             msa = expand_alpha (msa);
1018             ms = pix_multiply (ms, msa);
1019         }
1020
1021         md = pix_add (md, ms);
1022         store8888 (dest, md);
1023
1024         ++src;
1025         ++dest;
1026         if (mask)
1027             mask++;
1028     }
1029     _mm_empty ();
1030 }
1031
1032 static void
1033 mmx_combine_src_ca (pixman_implementation_t *imp,
1034                     pixman_op_t              op,
1035                     uint32_t *               dest,
1036                     const uint32_t *         src,
1037                     const uint32_t *         mask,
1038                     int                      width)
1039 {
1040     const uint32_t *end = src + width;
1041
1042     while (src < end)
1043     {
1044         __m64 a = load8888 (mask);
1045         __m64 s = load8888 (src);
1046
1047         s = pix_multiply (s, a);
1048         store8888 (dest, s);
1049
1050         ++src;
1051         ++mask;
1052         ++dest;
1053     }
1054     _mm_empty ();
1055 }
1056
1057 static void
1058 mmx_combine_over_ca (pixman_implementation_t *imp,
1059                      pixman_op_t              op,
1060                      uint32_t *               dest,
1061                      const uint32_t *         src,
1062                      const uint32_t *         mask,
1063                      int                      width)
1064 {
1065     const uint32_t *end = src + width;
1066
1067     while (src < end)
1068     {
1069         __m64 a = load8888 (mask);
1070         __m64 s = load8888 (src);
1071         __m64 d = load8888 (dest);
1072         __m64 sa = expand_alpha (s);
1073
1074         store8888 (dest, in_over (s, sa, a, d));
1075
1076         ++src;
1077         ++dest;
1078         ++mask;
1079     }
1080     _mm_empty ();
1081 }
1082
1083 static void
1084 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1085                              pixman_op_t              op,
1086                              uint32_t *               dest,
1087                              const uint32_t *         src,
1088                              const uint32_t *         mask,
1089                              int                      width)
1090 {
1091     const uint32_t *end = src + width;
1092
1093     while (src < end)
1094     {
1095         __m64 a = load8888 (mask);
1096         __m64 s = load8888 (src);
1097         __m64 d = load8888 (dest);
1098         __m64 da = expand_alpha (d);
1099
1100         store8888 (dest, over (d, da, in (s, a)));
1101
1102         ++src;
1103         ++dest;
1104         ++mask;
1105     }
1106     _mm_empty ();
1107 }
1108
1109 static void
1110 mmx_combine_in_ca (pixman_implementation_t *imp,
1111                    pixman_op_t              op,
1112                    uint32_t *               dest,
1113                    const uint32_t *         src,
1114                    const uint32_t *         mask,
1115                    int                      width)
1116 {
1117     const uint32_t *end = src + width;
1118
1119     while (src < end)
1120     {
1121         __m64 a = load8888 (mask);
1122         __m64 s = load8888 (src);
1123         __m64 d = load8888 (dest);
1124         __m64 da = expand_alpha (d);
1125
1126         s = pix_multiply (s, a);
1127         s = pix_multiply (s, da);
1128         store8888 (dest, s);
1129
1130         ++src;
1131         ++dest;
1132         ++mask;
1133     }
1134     _mm_empty ();
1135 }
1136
1137 static void
1138 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1139                            pixman_op_t              op,
1140                            uint32_t *               dest,
1141                            const uint32_t *         src,
1142                            const uint32_t *         mask,
1143                            int                      width)
1144 {
1145     const uint32_t *end = src + width;
1146
1147     while (src < end)
1148     {
1149         __m64 a = load8888 (mask);
1150         __m64 s = load8888 (src);
1151         __m64 d = load8888 (dest);
1152         __m64 sa = expand_alpha (s);
1153
1154         a = pix_multiply (a, sa);
1155         d = pix_multiply (d, a);
1156         store8888 (dest, d);
1157
1158         ++src;
1159         ++dest;
1160         ++mask;
1161     }
1162     _mm_empty ();
1163 }
1164
1165 static void
1166 mmx_combine_out_ca (pixman_implementation_t *imp,
1167                     pixman_op_t              op,
1168                     uint32_t *               dest,
1169                     const uint32_t *         src,
1170                     const uint32_t *         mask,
1171                     int                      width)
1172 {
1173     const uint32_t *end = src + width;
1174
1175     while (src < end)
1176     {
1177         __m64 a = load8888 (mask);
1178         __m64 s = load8888 (src);
1179         __m64 d = load8888 (dest);
1180         __m64 da = expand_alpha (d);
1181
1182         da = negate (da);
1183         s = pix_multiply (s, a);
1184         s = pix_multiply (s, da);
1185         store8888 (dest, s);
1186
1187         ++src;
1188         ++dest;
1189         ++mask;
1190     }
1191     _mm_empty ();
1192 }
1193
1194 static void
1195 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1196                             pixman_op_t              op,
1197                             uint32_t *               dest,
1198                             const uint32_t *         src,
1199                             const uint32_t *         mask,
1200                             int                      width)
1201 {
1202     const uint32_t *end = src + width;
1203
1204     while (src < end)
1205     {
1206         __m64 a = load8888 (mask);
1207         __m64 s = load8888 (src);
1208         __m64 d = load8888 (dest);
1209         __m64 sa = expand_alpha (s);
1210
1211         a = pix_multiply (a, sa);
1212         a = negate (a);
1213         d = pix_multiply (d, a);
1214         store8888 (dest, d);
1215
1216         ++src;
1217         ++dest;
1218         ++mask;
1219     }
1220     _mm_empty ();
1221 }
1222
1223 static void
1224 mmx_combine_atop_ca (pixman_implementation_t *imp,
1225                      pixman_op_t              op,
1226                      uint32_t *               dest,
1227                      const uint32_t *         src,
1228                      const uint32_t *         mask,
1229                      int                      width)
1230 {
1231     const uint32_t *end = src + width;
1232
1233     while (src < end)
1234     {
1235         __m64 a = load8888 (mask);
1236         __m64 s = load8888 (src);
1237         __m64 d = load8888 (dest);
1238         __m64 da = expand_alpha (d);
1239         __m64 sa = expand_alpha (s);
1240
1241         s = pix_multiply (s, a);
1242         a = pix_multiply (a, sa);
1243         a = negate (a);
1244         d = pix_add_mul (d, a, s, da);
1245         store8888 (dest, d);
1246
1247         ++src;
1248         ++dest;
1249         ++mask;
1250     }
1251     _mm_empty ();
1252 }
1253
1254 static void
1255 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1256                              pixman_op_t              op,
1257                              uint32_t *               dest,
1258                              const uint32_t *         src,
1259                              const uint32_t *         mask,
1260                              int                      width)
1261 {
1262     const uint32_t *end = src + width;
1263
1264     while (src < end)
1265     {
1266         __m64 a = load8888 (mask);
1267         __m64 s = load8888 (src);
1268         __m64 d = load8888 (dest);
1269         __m64 da = expand_alpha (d);
1270         __m64 sa = expand_alpha (s);
1271
1272         s = pix_multiply (s, a);
1273         a = pix_multiply (a, sa);
1274         da = negate (da);
1275         d = pix_add_mul (d, a, s, da);
1276         store8888 (dest, d);
1277
1278         ++src;
1279         ++dest;
1280         ++mask;
1281     }
1282     _mm_empty ();
1283 }
1284
1285 static void
1286 mmx_combine_xor_ca (pixman_implementation_t *imp,
1287                     pixman_op_t              op,
1288                     uint32_t *               dest,
1289                     const uint32_t *         src,
1290                     const uint32_t *         mask,
1291                     int                      width)
1292 {
1293     const uint32_t *end = src + width;
1294
1295     while (src < end)
1296     {
1297         __m64 a = load8888 (mask);
1298         __m64 s = load8888 (src);
1299         __m64 d = load8888 (dest);
1300         __m64 da = expand_alpha (d);
1301         __m64 sa = expand_alpha (s);
1302
1303         s = pix_multiply (s, a);
1304         a = pix_multiply (a, sa);
1305         da = negate (da);
1306         a = negate (a);
1307         d = pix_add_mul (d, a, s, da);
1308         store8888 (dest, d);
1309
1310         ++src;
1311         ++dest;
1312         ++mask;
1313     }
1314     _mm_empty ();
1315 }
1316
1317 static void
1318 mmx_combine_add_ca (pixman_implementation_t *imp,
1319                     pixman_op_t              op,
1320                     uint32_t *               dest,
1321                     const uint32_t *         src,
1322                     const uint32_t *         mask,
1323                     int                      width)
1324 {
1325     const uint32_t *end = src + width;
1326
1327     while (src < end)
1328     {
1329         __m64 a = load8888 (mask);
1330         __m64 s = load8888 (src);
1331         __m64 d = load8888 (dest);
1332
1333         s = pix_multiply (s, a);
1334         d = pix_add (s, d);
1335         store8888 (dest, d);
1336
1337         ++src;
1338         ++dest;
1339         ++mask;
1340     }
1341     _mm_empty ();
1342 }
1343
1344 /* ------------- MMX code paths called from fbpict.c -------------------- */
1345
1346 static void
1347 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1348                            pixman_composite_info_t *info)
1349 {
1350     PIXMAN_COMPOSITE_ARGS (info);
1351     uint32_t src;
1352     uint32_t    *dst_line, *dst;
1353     int32_t w;
1354     int dst_stride;
1355     __m64 vsrc, vsrca;
1356
1357     CHECKPOINT ();
1358
1359     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1360
1361     if (src == 0)
1362         return;
1363
1364     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1365
1366     vsrc = load8888 (&src);
1367     vsrca = expand_alpha (vsrc);
1368
1369     while (height--)
1370     {
1371         dst = dst_line;
1372         dst_line += dst_stride;
1373         w = width;
1374
1375         CHECKPOINT ();
1376
1377         while (w && (unsigned long)dst & 7)
1378         {
1379             store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1380
1381             w--;
1382             dst++;
1383         }
1384
1385         while (w >= 2)
1386         {
1387             __m64 vdest;
1388             __m64 dest0, dest1;
1389
1390             vdest = *(__m64 *)dst;
1391
1392             dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1393             dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1394
1395             *(__m64 *)dst = pack8888 (dest0, dest1);
1396
1397             dst += 2;
1398             w -= 2;
1399         }
1400
1401         CHECKPOINT ();
1402
1403         if (w)
1404         {
1405             store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1406         }
1407     }
1408
1409     _mm_empty ();
1410 }
1411
1412 static void
1413 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1414                            pixman_composite_info_t *info)
1415 {
1416     PIXMAN_COMPOSITE_ARGS (info);
1417     uint32_t src;
1418     uint16_t    *dst_line, *dst;
1419     int32_t w;
1420     int dst_stride;
1421     __m64 vsrc, vsrca;
1422
1423     CHECKPOINT ();
1424
1425     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1426
1427     if (src == 0)
1428         return;
1429
1430     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1431
1432     vsrc = load8888 (&src);
1433     vsrca = expand_alpha (vsrc);
1434
1435     while (height--)
1436     {
1437         dst = dst_line;
1438         dst_line += dst_stride;
1439         w = width;
1440
1441         CHECKPOINT ();
1442
1443         while (w && (unsigned long)dst & 7)
1444         {
1445             uint64_t d = *dst;
1446             __m64 vdest = expand565 (to_m64 (d), 0);
1447
1448             vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1449             *dst = to_uint64 (vdest);
1450
1451             w--;
1452             dst++;
1453         }
1454
1455         while (w >= 4)
1456         {
1457             __m64 vdest = *(__m64 *)dst;
1458             __m64 v0, v1, v2, v3;
1459
1460             expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1461
1462             v0 = over (vsrc, vsrca, v0);
1463             v1 = over (vsrc, vsrca, v1);
1464             v2 = over (vsrc, vsrca, v2);
1465             v3 = over (vsrc, vsrca, v3);
1466
1467             *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1468
1469             dst += 4;
1470             w -= 4;
1471         }
1472
1473         CHECKPOINT ();
1474
1475         while (w)
1476         {
1477             uint64_t d = *dst;
1478             __m64 vdest = expand565 (to_m64 (d), 0);
1479
1480             vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1481             *dst = to_uint64 (vdest);
1482
1483             w--;
1484             dst++;
1485         }
1486     }
1487
1488     _mm_empty ();
1489 }
1490
1491 static void
1492 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1493                                    pixman_composite_info_t *info)
1494 {
1495     PIXMAN_COMPOSITE_ARGS (info);
1496     uint32_t src;
1497     uint32_t    *dst_line;
1498     uint32_t    *mask_line;
1499     int dst_stride, mask_stride;
1500     __m64 vsrc, vsrca;
1501
1502     CHECKPOINT ();
1503
1504     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1505
1506     if (src == 0)
1507         return;
1508
1509     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1510     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1511
1512     vsrc = load8888 (&src);
1513     vsrca = expand_alpha (vsrc);
1514
1515     while (height--)
1516     {
1517         int twidth = width;
1518         uint32_t *p = (uint32_t *)mask_line;
1519         uint32_t *q = (uint32_t *)dst_line;
1520
1521         while (twidth && (unsigned long)q & 7)
1522         {
1523             uint32_t m = *(uint32_t *)p;
1524
1525             if (m)
1526             {
1527                 __m64 vdest = load8888 (q);
1528                 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1529                 store8888 (q, vdest);
1530             }
1531
1532             twidth--;
1533             p++;
1534             q++;
1535         }
1536
1537         while (twidth >= 2)
1538         {
1539             uint32_t m0, m1;
1540             m0 = *p;
1541             m1 = *(p + 1);
1542
1543             if (m0 | m1)
1544             {
1545                 __m64 dest0, dest1;
1546                 __m64 vdest = *(__m64 *)q;
1547
1548                 dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1549                                  expand8888 (vdest, 0));
1550                 dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1551                                  expand8888 (vdest, 1));
1552
1553                 *(__m64 *)q = pack8888 (dest0, dest1);
1554             }
1555
1556             p += 2;
1557             q += 2;
1558             twidth -= 2;
1559         }
1560
1561         if (twidth)
1562         {
1563             uint32_t m = *(uint32_t *)p;
1564
1565             if (m)
1566             {
1567                 __m64 vdest = load8888 (q);
1568                 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1569                 store8888 (q, vdest);
1570             }
1571
1572             twidth--;
1573             p++;
1574             q++;
1575         }
1576
1577         dst_line += dst_stride;
1578         mask_line += mask_stride;
1579     }
1580
1581     _mm_empty ();
1582 }
1583
1584 static void
1585 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1586                                 pixman_composite_info_t *info)
1587 {
1588     PIXMAN_COMPOSITE_ARGS (info);
1589     uint32_t    *dst_line, *dst;
1590     uint32_t    *src_line, *src;
1591     uint32_t mask;
1592     __m64 vmask;
1593     int dst_stride, src_stride;
1594     int32_t w;
1595
1596     CHECKPOINT ();
1597
1598     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1599     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1600
1601     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1602     mask &= 0xff000000;
1603     mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1604     vmask = load8888 (&mask);
1605
1606     while (height--)
1607     {
1608         dst = dst_line;
1609         dst_line += dst_stride;
1610         src = src_line;
1611         src_line += src_stride;
1612         w = width;
1613
1614         while (w && (unsigned long)dst & 7)
1615         {
1616             __m64 s = load8888 (src);
1617             __m64 d = load8888 (dst);
1618
1619             store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1620
1621             w--;
1622             dst++;
1623             src++;
1624         }
1625
1626         while (w >= 2)
1627         {
1628             __m64 vs = ldq_u ((__m64 *)src);
1629             __m64 vd = *(__m64 *)dst;
1630             __m64 vsrc0 = expand8888 (vs, 0);
1631             __m64 vsrc1 = expand8888 (vs, 1);
1632
1633             *(__m64 *)dst = pack8888 (
1634                 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1635                 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1636
1637             w -= 2;
1638             dst += 2;
1639             src += 2;
1640         }
1641
1642         if (w)
1643         {
1644             __m64 s = load8888 (src);
1645             __m64 d = load8888 (dst);
1646
1647             store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1648         }
1649     }
1650
1651     _mm_empty ();
1652 }
1653
1654 static void
1655 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1656                                 pixman_composite_info_t *info)
1657 {
1658     PIXMAN_COMPOSITE_ARGS (info);
1659     uint32_t *dst_line, *dst;
1660     uint32_t *src_line, *src;
1661     uint32_t mask;
1662     __m64 vmask;
1663     int dst_stride, src_stride;
1664     int32_t w;
1665     __m64 srca;
1666
1667     CHECKPOINT ();
1668
1669     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1670     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1671     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1672
1673     mask &= 0xff000000;
1674     mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1675     vmask = load8888 (&mask);
1676     srca = MC (4x00ff);
1677
1678     while (height--)
1679     {
1680         dst = dst_line;
1681         dst_line += dst_stride;
1682         src = src_line;
1683         src_line += src_stride;
1684         w = width;
1685
1686         while (w && (unsigned long)dst & 7)
1687         {
1688             uint32_t ssrc = *src | 0xff000000;
1689             __m64 s = load8888 (&ssrc);
1690             __m64 d = load8888 (dst);
1691
1692             store8888 (dst, in_over (s, srca, vmask, d));
1693
1694             w--;
1695             dst++;
1696             src++;
1697         }
1698
1699         while (w >= 16)
1700         {
1701             __m64 vd0 = *(__m64 *)(dst + 0);
1702             __m64 vd1 = *(__m64 *)(dst + 2);
1703             __m64 vd2 = *(__m64 *)(dst + 4);
1704             __m64 vd3 = *(__m64 *)(dst + 6);
1705             __m64 vd4 = *(__m64 *)(dst + 8);
1706             __m64 vd5 = *(__m64 *)(dst + 10);
1707             __m64 vd6 = *(__m64 *)(dst + 12);
1708             __m64 vd7 = *(__m64 *)(dst + 14);
1709
1710             __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1711             __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1712             __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1713             __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1714             __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1715             __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1716             __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1717             __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1718
1719             vd0 = pack8888 (
1720                 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1721                 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1722
1723             vd1 = pack8888 (
1724                 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1725                 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1726
1727             vd2 = pack8888 (
1728                 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1729                 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1730
1731             vd3 = pack8888 (
1732                 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1733                 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1734
1735             vd4 = pack8888 (
1736                 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1737                 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1738
1739             vd5 = pack8888 (
1740                 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1741                 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1742
1743             vd6 = pack8888 (
1744                 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1745                 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1746
1747             vd7 = pack8888 (
1748                 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1749                 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1750
1751             *(__m64 *)(dst + 0) = vd0;
1752             *(__m64 *)(dst + 2) = vd1;
1753             *(__m64 *)(dst + 4) = vd2;
1754             *(__m64 *)(dst + 6) = vd3;
1755             *(__m64 *)(dst + 8) = vd4;
1756             *(__m64 *)(dst + 10) = vd5;
1757             *(__m64 *)(dst + 12) = vd6;
1758             *(__m64 *)(dst + 14) = vd7;
1759
1760             w -= 16;
1761             dst += 16;
1762             src += 16;
1763         }
1764
1765         while (w)
1766         {
1767             uint32_t ssrc = *src | 0xff000000;
1768             __m64 s = load8888 (&ssrc);
1769             __m64 d = load8888 (dst);
1770
1771             store8888 (dst, in_over (s, srca, vmask, d));
1772
1773             w--;
1774             dst++;
1775             src++;
1776         }
1777     }
1778
1779     _mm_empty ();
1780 }
1781
1782 static void
1783 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1784                               pixman_composite_info_t *info)
1785 {
1786     PIXMAN_COMPOSITE_ARGS (info);
1787     uint32_t *dst_line, *dst;
1788     uint32_t *src_line, *src;
1789     uint32_t s;
1790     int dst_stride, src_stride;
1791     uint8_t a;
1792     int32_t w;
1793
1794     CHECKPOINT ();
1795
1796     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1797     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1798
1799     while (height--)
1800     {
1801         dst = dst_line;
1802         dst_line += dst_stride;
1803         src = src_line;
1804         src_line += src_stride;
1805         w = width;
1806
1807         while (w--)
1808         {
1809             s = *src++;
1810             a = s >> 24;
1811
1812             if (a == 0xff)
1813             {
1814                 *dst = s;
1815             }
1816             else if (s)
1817             {
1818                 __m64 ms, sa;
1819                 ms = load8888 (&s);
1820                 sa = expand_alpha (ms);
1821                 store8888 (dst, over (ms, sa, load8888 (dst)));
1822             }
1823
1824             dst++;
1825         }
1826     }
1827     _mm_empty ();
1828 }
1829
1830 static void
1831 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1832                               pixman_composite_info_t *info)
1833 {
1834     PIXMAN_COMPOSITE_ARGS (info);
1835     uint16_t    *dst_line, *dst;
1836     uint32_t    *src_line, *src;
1837     int dst_stride, src_stride;
1838     int32_t w;
1839
1840     CHECKPOINT ();
1841
1842     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1843     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1844
1845 #if 0
1846     /* FIXME */
1847     assert (src_image->drawable == mask_image->drawable);
1848 #endif
1849
1850     while (height--)
1851     {
1852         dst = dst_line;
1853         dst_line += dst_stride;
1854         src = src_line;
1855         src_line += src_stride;
1856         w = width;
1857
1858         CHECKPOINT ();
1859
1860         while (w && (unsigned long)dst & 7)
1861         {
1862             __m64 vsrc = load8888 (src);
1863             uint64_t d = *dst;
1864             __m64 vdest = expand565 (to_m64 (d), 0);
1865
1866             vdest = pack_565 (
1867                 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1868
1869             *dst = to_uint64 (vdest);
1870
1871             w--;
1872             dst++;
1873             src++;
1874         }
1875
1876         CHECKPOINT ();
1877
1878         while (w >= 4)
1879         {
1880             __m64 vdest = *(__m64 *)dst;
1881             __m64 v0, v1, v2, v3;
1882
1883             expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1884
1885             __m64 vsrc0 = load8888 ((src + 0));
1886             __m64 vsrc1 = load8888 ((src + 1));
1887             __m64 vsrc2 = load8888 ((src + 2));
1888             __m64 vsrc3 = load8888 ((src + 3));
1889
1890             v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1891             v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1892             v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1893             v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1894
1895             *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1896
1897             w -= 4;
1898             dst += 4;
1899             src += 4;
1900         }
1901
1902         CHECKPOINT ();
1903
1904         while (w)
1905         {
1906             __m64 vsrc = load8888 (src);
1907             uint64_t d = *dst;
1908             __m64 vdest = expand565 (to_m64 (d), 0);
1909
1910             vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1911
1912             *dst = to_uint64 (vdest);
1913
1914             w--;
1915             dst++;
1916             src++;
1917         }
1918     }
1919
1920     _mm_empty ();
1921 }
1922
1923 static void
1924 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1925                              pixman_composite_info_t *info)
1926 {
1927     PIXMAN_COMPOSITE_ARGS (info);
1928     uint32_t src, srca;
1929     uint32_t *dst_line, *dst;
1930     uint8_t *mask_line, *mask;
1931     int dst_stride, mask_stride;
1932     int32_t w;
1933     __m64 vsrc, vsrca;
1934     uint64_t srcsrc;
1935
1936     CHECKPOINT ();
1937
1938     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1939
1940     srca = src >> 24;
1941     if (src == 0)
1942         return;
1943
1944     srcsrc = (uint64_t)src << 32 | src;
1945
1946     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1947     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1948
1949     vsrc = load8888 (&src);
1950     vsrca = expand_alpha (vsrc);
1951
1952     while (height--)
1953     {
1954         dst = dst_line;
1955         dst_line += dst_stride;
1956         mask = mask_line;
1957         mask_line += mask_stride;
1958         w = width;
1959
1960         CHECKPOINT ();
1961
1962         while (w && (unsigned long)dst & 7)
1963         {
1964             uint64_t m = *mask;
1965
1966             if (m)
1967             {
1968                 __m64 vdest = in_over (vsrc, vsrca,
1969                                        expand_alpha_rev (to_m64 (m)),
1970                                        load8888 (dst));
1971
1972                 store8888 (dst, vdest);
1973             }
1974
1975             w--;
1976             mask++;
1977             dst++;
1978         }
1979
1980         CHECKPOINT ();
1981
1982         while (w >= 2)
1983         {
1984             uint64_t m0, m1;
1985
1986             m0 = *mask;
1987             m1 = *(mask + 1);
1988
1989             if (srca == 0xff && (m0 & m1) == 0xff)
1990             {
1991                 *(uint64_t *)dst = srcsrc;
1992             }
1993             else if (m0 | m1)
1994             {
1995                 __m64 vdest;
1996                 __m64 dest0, dest1;
1997
1998                 vdest = *(__m64 *)dst;
1999
2000                 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2001                                  expand8888 (vdest, 0));
2002                 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2003                                  expand8888 (vdest, 1));
2004
2005                 *(__m64 *)dst = pack8888 (dest0, dest1);
2006             }
2007
2008             mask += 2;
2009             dst += 2;
2010             w -= 2;
2011         }
2012
2013         CHECKPOINT ();
2014
2015         if (w)
2016         {
2017             uint64_t m = *mask;
2018
2019             if (m)
2020             {
2021                 __m64 vdest = load8888 (dst);
2022
2023                 vdest = in_over (
2024                     vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2025                 store8888 (dst, vdest);
2026             }
2027         }
2028     }
2029
2030     _mm_empty ();
2031 }
2032
2033 pixman_bool_t
2034 pixman_fill_mmx (uint32_t *bits,
2035                  int       stride,
2036                  int       bpp,
2037                  int       x,
2038                  int       y,
2039                  int       width,
2040                  int       height,
2041                  uint32_t xor)
2042 {
2043     uint64_t fill;
2044     __m64 vfill;
2045     uint32_t byte_width;
2046     uint8_t     *byte_line;
2047
2048 #if defined __GNUC__ && defined USE_X86_MMX
2049     __m64 v1, v2, v3, v4, v5, v6, v7;
2050 #endif
2051
2052     if (bpp != 16 && bpp != 32 && bpp != 8)
2053         return FALSE;
2054
2055     if (bpp == 8)
2056     {
2057         stride = stride * (int) sizeof (uint32_t) / 1;
2058         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2059         byte_width = width;
2060         stride *= 1;
2061         xor = (xor & 0xff) * 0x01010101;
2062     }
2063     else if (bpp == 16)
2064     {
2065         stride = stride * (int) sizeof (uint32_t) / 2;
2066         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2067         byte_width = 2 * width;
2068         stride *= 2;
2069         xor = (xor & 0xffff) * 0x00010001;
2070     }
2071     else
2072     {
2073         stride = stride * (int) sizeof (uint32_t) / 4;
2074         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2075         byte_width = 4 * width;
2076         stride *= 4;
2077     }
2078
2079     fill = ((uint64_t)xor << 32) | xor;
2080     vfill = to_m64 (fill);
2081
2082 #if defined __GNUC__ && defined USE_X86_MMX
2083     __asm__ (
2084         "movq           %7,     %0\n"
2085         "movq           %7,     %1\n"
2086         "movq           %7,     %2\n"
2087         "movq           %7,     %3\n"
2088         "movq           %7,     %4\n"
2089         "movq           %7,     %5\n"
2090         "movq           %7,     %6\n"
2091         : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2092           "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2093         : "y" (vfill));
2094 #endif
2095
2096     while (height--)
2097     {
2098         int w;
2099         uint8_t *d = byte_line;
2100
2101         byte_line += stride;
2102         w = byte_width;
2103
2104         if (w >= 1 && ((unsigned long)d & 1))
2105         {
2106             *(uint8_t *)d = (xor & 0xff);
2107             w--;
2108             d++;
2109         }
2110
2111         if (w >= 2 && ((unsigned long)d & 3))
2112         {
2113             *(uint16_t *)d = xor;
2114             w -= 2;
2115             d += 2;
2116         }
2117
2118         while (w >= 4 && ((unsigned long)d & 7))
2119         {
2120             *(uint32_t *)d = xor;
2121
2122             w -= 4;
2123             d += 4;
2124         }
2125
2126         while (w >= 64)
2127         {
2128 #if defined __GNUC__ && defined USE_X86_MMX
2129             __asm__ (
2130                 "movq   %1,       (%0)\n"
2131                 "movq   %2,      8(%0)\n"
2132                 "movq   %3,     16(%0)\n"
2133                 "movq   %4,     24(%0)\n"
2134                 "movq   %5,     32(%0)\n"
2135                 "movq   %6,     40(%0)\n"
2136                 "movq   %7,     48(%0)\n"
2137                 "movq   %8,     56(%0)\n"
2138                 :
2139                 : "r" (d),
2140                   "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2141                   "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2142                 : "memory");
2143 #else
2144             *(__m64*) (d +  0) = vfill;
2145             *(__m64*) (d +  8) = vfill;
2146             *(__m64*) (d + 16) = vfill;
2147             *(__m64*) (d + 24) = vfill;
2148             *(__m64*) (d + 32) = vfill;
2149             *(__m64*) (d + 40) = vfill;
2150             *(__m64*) (d + 48) = vfill;
2151             *(__m64*) (d + 56) = vfill;
2152 #endif
2153             w -= 64;
2154             d += 64;
2155         }
2156
2157         while (w >= 4)
2158         {
2159             *(uint32_t *)d = xor;
2160
2161             w -= 4;
2162             d += 4;
2163         }
2164         if (w >= 2)
2165         {
2166             *(uint16_t *)d = xor;
2167             w -= 2;
2168             d += 2;
2169         }
2170         if (w >= 1)
2171         {
2172             *(uint8_t *)d = (xor & 0xff);
2173             w--;
2174             d++;
2175         }
2176
2177     }
2178
2179     _mm_empty ();
2180     return TRUE;
2181 }
2182
2183 static void
2184 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2185                              pixman_composite_info_t *info)
2186 {
2187     PIXMAN_COMPOSITE_ARGS (info);
2188     uint16_t    *dst_line, *dst;
2189     uint32_t    *src_line, *src, s;
2190     int dst_stride, src_stride;
2191     int32_t w;
2192
2193     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2194     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2195
2196     while (height--)
2197     {
2198         dst = dst_line;
2199         dst_line += dst_stride;
2200         src = src_line;
2201         src_line += src_stride;
2202         w = width;
2203
2204         while (w && (unsigned long)dst & 7)
2205         {
2206             s = *src++;
2207             *dst = CONVERT_8888_TO_0565 (s);
2208             dst++;
2209             w--;
2210         }
2211
2212         while (w >= 4)
2213         {
2214             __m64 vdest;
2215             __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2216             __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2217
2218             vdest = pack_4xpacked565 (vsrc0, vsrc1);
2219
2220             *(__m64 *)dst = vdest;
2221
2222             w -= 4;
2223             src += 4;
2224             dst += 4;
2225         }
2226
2227         while (w)
2228         {
2229             s = *src++;
2230             *dst = CONVERT_8888_TO_0565 (s);
2231             dst++;
2232             w--;
2233         }
2234     }
2235 }
2236
2237 static void
2238 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2239                             pixman_composite_info_t *info)
2240 {
2241     PIXMAN_COMPOSITE_ARGS (info);
2242     uint32_t src, srca;
2243     uint32_t    *dst_line, *dst;
2244     uint8_t     *mask_line, *mask;
2245     int dst_stride, mask_stride;
2246     int32_t w;
2247     __m64 vsrc;
2248     uint64_t srcsrc;
2249
2250     CHECKPOINT ();
2251
2252     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2253
2254     srca = src >> 24;
2255     if (src == 0)
2256     {
2257         pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride,
2258                          PIXMAN_FORMAT_BPP (dest_image->bits.format),
2259                          dest_x, dest_y, width, height, 0);
2260         return;
2261     }
2262
2263     srcsrc = (uint64_t)src << 32 | src;
2264
2265     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2266     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2267
2268     vsrc = load8888 (&src);
2269
2270     while (height--)
2271     {
2272         dst = dst_line;
2273         dst_line += dst_stride;
2274         mask = mask_line;
2275         mask_line += mask_stride;
2276         w = width;
2277
2278         CHECKPOINT ();
2279
2280         while (w && (unsigned long)dst & 7)
2281         {
2282             uint64_t m = *mask;
2283
2284             if (m)
2285             {
2286                 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2287
2288                 store8888 (dst, vdest);
2289             }
2290             else
2291             {
2292                 *dst = 0;
2293             }
2294
2295             w--;
2296             mask++;
2297             dst++;
2298         }
2299
2300         CHECKPOINT ();
2301
2302         while (w >= 2)
2303         {
2304             uint64_t m0, m1;
2305             m0 = *mask;
2306             m1 = *(mask + 1);
2307
2308             if (srca == 0xff && (m0 & m1) == 0xff)
2309             {
2310                 *(uint64_t *)dst = srcsrc;
2311             }
2312             else if (m0 | m1)
2313             {
2314                 __m64 dest0, dest1;
2315
2316                 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2317                 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2318
2319                 *(__m64 *)dst = pack8888 (dest0, dest1);
2320             }
2321             else
2322             {
2323                 *(uint64_t *)dst = 0;
2324             }
2325
2326             mask += 2;
2327             dst += 2;
2328             w -= 2;
2329         }
2330
2331         CHECKPOINT ();
2332
2333         if (w)
2334         {
2335             uint64_t m = *mask;
2336
2337             if (m)
2338             {
2339                 __m64 vdest = load8888 (dst);
2340
2341                 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2342                 store8888 (dst, vdest);
2343             }
2344             else
2345             {
2346                 *dst = 0;
2347             }
2348         }
2349     }
2350
2351     _mm_empty ();
2352 }
2353
2354 static void
2355 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2356                              pixman_composite_info_t *info)
2357 {
2358     PIXMAN_COMPOSITE_ARGS (info);
2359     uint32_t src, srca;
2360     uint16_t *dst_line, *dst;
2361     uint8_t *mask_line, *mask;
2362     int dst_stride, mask_stride;
2363     int32_t w;
2364     __m64 vsrc, vsrca, tmp;
2365     __m64 srcsrcsrcsrc;
2366
2367     CHECKPOINT ();
2368
2369     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2370
2371     srca = src >> 24;
2372     if (src == 0)
2373         return;
2374
2375     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2376     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2377
2378     vsrc = load8888 (&src);
2379     vsrca = expand_alpha (vsrc);
2380
2381     tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2382     srcsrcsrcsrc = expand_alpha_rev (tmp);
2383
2384     while (height--)
2385     {
2386         dst = dst_line;
2387         dst_line += dst_stride;
2388         mask = mask_line;
2389         mask_line += mask_stride;
2390         w = width;
2391
2392         CHECKPOINT ();
2393
2394         while (w && (unsigned long)dst & 7)
2395         {
2396             uint64_t m = *mask;
2397
2398             if (m)
2399             {
2400                 uint64_t d = *dst;
2401                 __m64 vd = to_m64 (d);
2402                 __m64 vdest = in_over (
2403                     vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2404
2405                 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2406                 *dst = to_uint64 (vd);
2407             }
2408
2409             w--;
2410             mask++;
2411             dst++;
2412         }
2413
2414         CHECKPOINT ();
2415
2416         while (w >= 4)
2417         {
2418             uint64_t m0, m1, m2, m3;
2419             m0 = *mask;
2420             m1 = *(mask + 1);
2421             m2 = *(mask + 2);
2422             m3 = *(mask + 3);
2423
2424             if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2425             {
2426                 *(__m64 *)dst = srcsrcsrcsrc;
2427             }
2428             else if (m0 | m1 | m2 | m3)
2429             {
2430                 __m64 vdest = *(__m64 *)dst;
2431                 __m64 v0, v1, v2, v3;
2432
2433                 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2434
2435                 __m64 vm0 = to_m64 (m0);
2436                 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2437
2438                 __m64 vm1 = to_m64 (m1);
2439                 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2440
2441                 __m64 vm2 = to_m64 (m2);
2442                 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2443
2444                 __m64 vm3 = to_m64 (m3);
2445                 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2446
2447                 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);;
2448             }
2449
2450             w -= 4;
2451             mask += 4;
2452             dst += 4;
2453         }
2454
2455         CHECKPOINT ();
2456
2457         while (w)
2458         {
2459             uint64_t m = *mask;
2460
2461             if (m)
2462             {
2463                 uint64_t d = *dst;
2464                 __m64 vd = to_m64 (d);
2465                 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2466                                        expand565 (vd, 0));
2467                 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2468                 *dst = to_uint64 (vd);
2469             }
2470
2471             w--;
2472             mask++;
2473             dst++;
2474         }
2475     }
2476
2477     _mm_empty ();
2478 }
2479
2480 static void
2481 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2482                                 pixman_composite_info_t *info)
2483 {
2484     PIXMAN_COMPOSITE_ARGS (info);
2485     uint16_t    *dst_line, *dst;
2486     uint32_t    *src_line, *src;
2487     int dst_stride, src_stride;
2488     int32_t w;
2489
2490     CHECKPOINT ();
2491
2492     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2493     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2494
2495 #if 0
2496     /* FIXME */
2497     assert (src_image->drawable == mask_image->drawable);
2498 #endif
2499
2500     while (height--)
2501     {
2502         dst = dst_line;
2503         dst_line += dst_stride;
2504         src = src_line;
2505         src_line += src_stride;
2506         w = width;
2507
2508         CHECKPOINT ();
2509
2510         while (w && (unsigned long)dst & 7)
2511         {
2512             __m64 vsrc = load8888 (src);
2513             uint64_t d = *dst;
2514             __m64 vdest = expand565 (to_m64 (d), 0);
2515
2516             vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2517
2518             *dst = to_uint64 (vdest);
2519
2520             w--;
2521             dst++;
2522             src++;
2523         }
2524
2525         CHECKPOINT ();
2526
2527         while (w >= 4)
2528         {
2529             uint32_t s0, s1, s2, s3;
2530             unsigned char a0, a1, a2, a3;
2531
2532             s0 = *src;
2533             s1 = *(src + 1);
2534             s2 = *(src + 2);
2535             s3 = *(src + 3);
2536
2537             a0 = (s0 >> 24);
2538             a1 = (s1 >> 24);
2539             a2 = (s2 >> 24);
2540             a3 = (s3 >> 24);
2541
2542             if ((a0 & a1 & a2 & a3) == 0xFF)
2543             {
2544                 __m64 v0 = invert_colors (load8888 (&s0));
2545                 __m64 v1 = invert_colors (load8888 (&s1));
2546                 __m64 v2 = invert_colors (load8888 (&s2));
2547                 __m64 v3 = invert_colors (load8888 (&s3));
2548
2549                 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2550             }
2551             else if (s0 | s1 | s2 | s3)
2552             {
2553                 __m64 vdest = *(__m64 *)dst;
2554                 __m64 v0, v1, v2, v3;
2555
2556                 __m64 vsrc0 = load8888 (&s0);
2557                 __m64 vsrc1 = load8888 (&s1);
2558                 __m64 vsrc2 = load8888 (&s2);
2559                 __m64 vsrc3 = load8888 (&s3);
2560
2561                 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2562
2563                 v0 = over_rev_non_pre (vsrc0, v0);
2564                 v1 = over_rev_non_pre (vsrc1, v1);
2565                 v2 = over_rev_non_pre (vsrc2, v2);
2566                 v3 = over_rev_non_pre (vsrc3, v3);
2567
2568                 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2569             }
2570
2571             w -= 4;
2572             dst += 4;
2573             src += 4;
2574         }
2575
2576         CHECKPOINT ();
2577
2578         while (w)
2579         {
2580             __m64 vsrc = load8888 (src);
2581             uint64_t d = *dst;
2582             __m64 vdest = expand565 (to_m64 (d), 0);
2583
2584             vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2585
2586             *dst = to_uint64 (vdest);
2587
2588             w--;
2589             dst++;
2590             src++;
2591         }
2592     }
2593
2594     _mm_empty ();
2595 }
2596
2597 static void
2598 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2599                                 pixman_composite_info_t *info)
2600 {
2601     PIXMAN_COMPOSITE_ARGS (info);
2602     uint32_t    *dst_line, *dst;
2603     uint32_t    *src_line, *src;
2604     int dst_stride, src_stride;
2605     int32_t w;
2606
2607     CHECKPOINT ();
2608
2609     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2610     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2611
2612 #if 0
2613     /* FIXME */
2614     assert (src_image->drawable == mask_image->drawable);
2615 #endif
2616
2617     while (height--)
2618     {
2619         dst = dst_line;
2620         dst_line += dst_stride;
2621         src = src_line;
2622         src_line += src_stride;
2623         w = width;
2624
2625         while (w && (unsigned long)dst & 7)
2626         {
2627             __m64 s = load8888 (src);
2628             __m64 d = load8888 (dst);
2629
2630             store8888 (dst, over_rev_non_pre (s, d));
2631
2632             w--;
2633             dst++;
2634             src++;
2635         }
2636
2637         while (w >= 2)
2638         {
2639             uint32_t s0, s1;
2640             unsigned char a0, a1;
2641             __m64 d0, d1;
2642
2643             s0 = *src;
2644             s1 = *(src + 1);
2645
2646             a0 = (s0 >> 24);
2647             a1 = (s1 >> 24);
2648
2649             if ((a0 & a1) == 0xFF)
2650             {
2651                 d0 = invert_colors (load8888 (&s0));
2652                 d1 = invert_colors (load8888 (&s1));
2653
2654                 *(__m64 *)dst = pack8888 (d0, d1);
2655             }
2656             else if (s0 | s1)
2657             {
2658                 __m64 vdest = *(__m64 *)dst;
2659
2660                 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2661                 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2662
2663                 *(__m64 *)dst = pack8888 (d0, d1);
2664             }
2665
2666             w -= 2;
2667             dst += 2;
2668             src += 2;
2669         }
2670
2671         if (w)
2672         {
2673             __m64 s = load8888 (src);
2674             __m64 d = load8888 (dst);
2675
2676             store8888 (dst, over_rev_non_pre (s, d));
2677         }
2678     }
2679
2680     _mm_empty ();
2681 }
2682
2683 static void
2684 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2685                                    pixman_composite_info_t *info)
2686 {
2687     PIXMAN_COMPOSITE_ARGS (info);
2688     uint32_t src;
2689     uint16_t    *dst_line;
2690     uint32_t    *mask_line;
2691     int dst_stride, mask_stride;
2692     __m64 vsrc, vsrca;
2693
2694     CHECKPOINT ();
2695
2696     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2697
2698     if (src == 0)
2699         return;
2700
2701     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2702     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2703
2704     vsrc = load8888 (&src);
2705     vsrca = expand_alpha (vsrc);
2706
2707     while (height--)
2708     {
2709         int twidth = width;
2710         uint32_t *p = (uint32_t *)mask_line;
2711         uint16_t *q = (uint16_t *)dst_line;
2712
2713         while (twidth && ((unsigned long)q & 7))
2714         {
2715             uint32_t m = *(uint32_t *)p;
2716
2717             if (m)
2718             {
2719                 uint64_t d = *q;
2720                 __m64 vdest = expand565 (to_m64 (d), 0);
2721                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2722                 *q = to_uint64 (vdest);
2723             }
2724
2725             twidth--;
2726             p++;
2727             q++;
2728         }
2729
2730         while (twidth >= 4)
2731         {
2732             uint32_t m0, m1, m2, m3;
2733
2734             m0 = *p;
2735             m1 = *(p + 1);
2736             m2 = *(p + 2);
2737             m3 = *(p + 3);
2738
2739             if ((m0 | m1 | m2 | m3))
2740             {
2741                 __m64 vdest = *(__m64 *)q;
2742                 __m64 v0, v1, v2, v3;
2743
2744                 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2745
2746                 v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2747                 v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2748                 v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2749                 v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2750
2751                 *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2752             }
2753             twidth -= 4;
2754             p += 4;
2755             q += 4;
2756         }
2757
2758         while (twidth)
2759         {
2760             uint32_t m;
2761
2762             m = *(uint32_t *)p;
2763             if (m)
2764             {
2765                 uint64_t d = *q;
2766                 __m64 vdest = expand565 (to_m64 (d), 0);
2767                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2768                 *q = to_uint64 (vdest);
2769             }
2770
2771             twidth--;
2772             p++;
2773             q++;
2774         }
2775
2776         mask_line += mask_stride;
2777         dst_line += dst_stride;
2778     }
2779
2780     _mm_empty ();
2781 }
2782
2783 static void
2784 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2785                         pixman_composite_info_t *info)
2786 {
2787     PIXMAN_COMPOSITE_ARGS (info);
2788     uint8_t *dst_line, *dst;
2789     uint8_t *mask_line, *mask;
2790     int dst_stride, mask_stride;
2791     int32_t w;
2792     uint32_t src;
2793     uint8_t sa;
2794     __m64 vsrc, vsrca;
2795
2796     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2797     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2798
2799     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2800
2801     sa = src >> 24;
2802
2803     vsrc = load8888 (&src);
2804     vsrca = expand_alpha (vsrc);
2805
2806     while (height--)
2807     {
2808         dst = dst_line;
2809         dst_line += dst_stride;
2810         mask = mask_line;
2811         mask_line += mask_stride;
2812         w = width;
2813
2814         while (w && (unsigned long)dst & 7)
2815         {
2816             uint16_t tmp;
2817             uint8_t a;
2818             uint32_t m, d;
2819
2820             a = *mask++;
2821             d = *dst;
2822
2823             m = MUL_UN8 (sa, a, tmp);
2824             d = MUL_UN8 (m, d, tmp);
2825
2826             *dst++ = d;
2827             w--;
2828         }
2829
2830         while (w >= 4)
2831         {
2832             __m64 vmask;
2833             __m64 vdest;
2834
2835             vmask = load8888u ((uint32_t *)mask);
2836             vdest = load8888 ((uint32_t *)dst);
2837
2838             store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2839
2840             dst += 4;
2841             mask += 4;
2842             w -= 4;
2843         }
2844
2845         while (w--)
2846         {
2847             uint16_t tmp;
2848             uint8_t a;
2849             uint32_t m, d;
2850
2851             a = *mask++;
2852             d = *dst;
2853
2854             m = MUL_UN8 (sa, a, tmp);
2855             d = MUL_UN8 (m, d, tmp);
2856
2857             *dst++ = d;
2858         }
2859     }
2860
2861     _mm_empty ();
2862 }
2863
2864 static void
2865 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2866                       pixman_composite_info_t *info)
2867 {
2868     PIXMAN_COMPOSITE_ARGS (info);
2869     uint8_t     *dst_line, *dst;
2870     uint8_t     *src_line, *src;
2871     int src_stride, dst_stride;
2872     int32_t w;
2873
2874     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2875     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2876
2877     while (height--)
2878     {
2879         dst = dst_line;
2880         dst_line += dst_stride;
2881         src = src_line;
2882         src_line += src_stride;
2883         w = width;
2884
2885         while (w && (unsigned long)dst & 3)
2886         {
2887             uint8_t s, d;
2888             uint16_t tmp;
2889
2890             s = *src;
2891             d = *dst;
2892
2893             *dst = MUL_UN8 (s, d, tmp);
2894
2895             src++;
2896             dst++;
2897             w--;
2898         }
2899
2900         while (w >= 4)
2901         {
2902             uint32_t *s = (uint32_t *)src;
2903             uint32_t *d = (uint32_t *)dst;
2904
2905             store8888 (d, in (load8888u (s), load8888 (d)));
2906
2907             w -= 4;
2908             dst += 4;
2909             src += 4;
2910         }
2911
2912         while (w--)
2913         {
2914             uint8_t s, d;
2915             uint16_t tmp;
2916
2917             s = *src;
2918             d = *dst;
2919
2920             *dst = MUL_UN8 (s, d, tmp);
2921
2922             src++;
2923             dst++;
2924         }
2925     }
2926
2927     _mm_empty ();
2928 }
2929
2930 static void
2931 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2932                          pixman_composite_info_t *info)
2933 {
2934     PIXMAN_COMPOSITE_ARGS (info);
2935     uint8_t     *dst_line, *dst;
2936     uint8_t     *mask_line, *mask;
2937     int dst_stride, mask_stride;
2938     int32_t w;
2939     uint32_t src;
2940     uint8_t sa;
2941     __m64 vsrc, vsrca;
2942
2943     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2944     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2945
2946     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2947
2948     sa = src >> 24;
2949
2950     if (src == 0)
2951         return;
2952
2953     vsrc = load8888 (&src);
2954     vsrca = expand_alpha (vsrc);
2955
2956     while (height--)
2957     {
2958         dst = dst_line;
2959         dst_line += dst_stride;
2960         mask = mask_line;
2961         mask_line += mask_stride;
2962         w = width;
2963
2964         while (w && (unsigned long)dst & 3)
2965         {
2966             uint16_t tmp;
2967             uint16_t a;
2968             uint32_t m, d;
2969             uint32_t r;
2970
2971             a = *mask++;
2972             d = *dst;
2973
2974             m = MUL_UN8 (sa, a, tmp);
2975             r = ADD_UN8 (m, d, tmp);
2976
2977             *dst++ = r;
2978             w--;
2979         }
2980
2981         while (w >= 4)
2982         {
2983             __m64 vmask;
2984             __m64 vdest;
2985
2986             vmask = load8888u ((uint32_t *)mask);
2987             vdest = load8888 ((uint32_t *)dst);
2988
2989             store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
2990
2991             dst += 4;
2992             mask += 4;
2993             w -= 4;
2994         }
2995
2996         while (w--)
2997         {
2998             uint16_t tmp;
2999             uint16_t a;
3000             uint32_t m, d;
3001             uint32_t r;
3002
3003             a = *mask++;
3004             d = *dst;
3005
3006             m = MUL_UN8 (sa, a, tmp);
3007             r = ADD_UN8 (m, d, tmp);
3008
3009             *dst++ = r;
3010         }
3011     }
3012
3013     _mm_empty ();
3014 }
3015
3016 static void
3017 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3018                        pixman_composite_info_t *info)
3019 {
3020     PIXMAN_COMPOSITE_ARGS (info);
3021     uint8_t *dst_line, *dst;
3022     uint8_t *src_line, *src;
3023     int dst_stride, src_stride;
3024     int32_t w;
3025     uint8_t s, d;
3026     uint16_t t;
3027
3028     CHECKPOINT ();
3029
3030     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3031     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3032
3033     while (height--)
3034     {
3035         dst = dst_line;
3036         dst_line += dst_stride;
3037         src = src_line;
3038         src_line += src_stride;
3039         w = width;
3040
3041         while (w && (unsigned long)dst & 7)
3042         {
3043             s = *src;
3044             d = *dst;
3045             t = d + s;
3046             s = t | (0 - (t >> 8));
3047             *dst = s;
3048
3049             dst++;
3050             src++;
3051             w--;
3052         }
3053
3054         while (w >= 8)
3055         {
3056             *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3057             dst += 8;
3058             src += 8;
3059             w -= 8;
3060         }
3061
3062         while (w)
3063         {
3064             s = *src;
3065             d = *dst;
3066             t = d + s;
3067             s = t | (0 - (t >> 8));
3068             *dst = s;
3069
3070             dst++;
3071             src++;
3072             w--;
3073         }
3074     }
3075
3076     _mm_empty ();
3077 }
3078
3079 static void
3080 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3081                              pixman_composite_info_t *info)
3082 {
3083     PIXMAN_COMPOSITE_ARGS (info);
3084     uint16_t    *dst_line, *dst;
3085     uint32_t    d;
3086     uint16_t    *src_line, *src;
3087     uint32_t    s;
3088     int dst_stride, src_stride;
3089     int32_t w;
3090
3091     CHECKPOINT ();
3092
3093     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3094     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3095
3096     while (height--)
3097     {
3098         dst = dst_line;
3099         dst_line += dst_stride;
3100         src = src_line;
3101         src_line += src_stride;
3102         w = width;
3103
3104         while (w && (unsigned long)dst & 7)
3105         {
3106             s = *src++;
3107             if (s)
3108             {
3109                 d = *dst;
3110                 s = CONVERT_0565_TO_8888 (s);
3111                 if (d)
3112                 {
3113                     d = CONVERT_0565_TO_8888 (d);
3114                     UN8x4_ADD_UN8x4 (s, d);
3115                 }
3116                 *dst = CONVERT_8888_TO_0565 (s);
3117             }
3118             dst++;
3119             w--;
3120         }
3121
3122         while (w >= 4)
3123         {
3124             __m64 vdest = *(__m64 *)dst;
3125             __m64 vsrc = ldq_u ((__m64 *)src);
3126             __m64 vd0, vd1;
3127             __m64 vs0, vs1;
3128
3129             expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3130             expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3131
3132             vd0 = _mm_adds_pu8 (vd0, vs0);
3133             vd1 = _mm_adds_pu8 (vd1, vs1);
3134
3135             *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3136
3137             dst += 4;
3138             src += 4;
3139             w -= 4;
3140         }
3141
3142         while (w--)
3143         {
3144             s = *src++;
3145             if (s)
3146             {
3147                 d = *dst;
3148                 s = CONVERT_0565_TO_8888 (s);
3149                 if (d)
3150                 {
3151                     d = CONVERT_0565_TO_8888 (d);
3152                     UN8x4_ADD_UN8x4 (s, d);
3153                 }
3154                 *dst = CONVERT_8888_TO_0565 (s);
3155             }
3156             dst++;
3157         }
3158     }
3159
3160     _mm_empty ();
3161 }
3162
3163 static void
3164 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3165                              pixman_composite_info_t *info)
3166 {
3167     PIXMAN_COMPOSITE_ARGS (info);
3168     uint32_t    *dst_line, *dst;
3169     uint32_t    *src_line, *src;
3170     int dst_stride, src_stride;
3171     int32_t w;
3172
3173     CHECKPOINT ();
3174
3175     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3176     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3177
3178     while (height--)
3179     {
3180         dst = dst_line;
3181         dst_line += dst_stride;
3182         src = src_line;
3183         src_line += src_stride;
3184         w = width;
3185
3186         while (w && (unsigned long)dst & 7)
3187         {
3188             store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3189                                       load ((const uint32_t *)dst)));
3190             dst++;
3191             src++;
3192             w--;
3193         }
3194
3195         while (w >= 2)
3196         {
3197             *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3198             dst += 2;
3199             src += 2;
3200             w -= 2;
3201         }
3202
3203         if (w)
3204         {
3205             store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3206                                       load ((const uint32_t *)dst)));
3207
3208         }
3209     }
3210
3211     _mm_empty ();
3212 }
3213
3214 static pixman_bool_t
3215 pixman_blt_mmx (uint32_t *src_bits,
3216                 uint32_t *dst_bits,
3217                 int       src_stride,
3218                 int       dst_stride,
3219                 int       src_bpp,
3220                 int       dst_bpp,
3221                 int       src_x,
3222                 int       src_y,
3223                 int       dest_x,
3224                 int       dest_y,
3225                 int       width,
3226                 int       height)
3227 {
3228     uint8_t *   src_bytes;
3229     uint8_t *   dst_bytes;
3230     int byte_width;
3231
3232     if (src_bpp != dst_bpp)
3233         return FALSE;
3234
3235     if (src_bpp == 16)
3236     {
3237         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3238         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3239         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3240         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3241         byte_width = 2 * width;
3242         src_stride *= 2;
3243         dst_stride *= 2;
3244     }
3245     else if (src_bpp == 32)
3246     {
3247         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3248         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3249         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3250         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3251         byte_width = 4 * width;
3252         src_stride *= 4;
3253         dst_stride *= 4;
3254     }
3255     else
3256     {
3257         return FALSE;
3258     }
3259
3260     while (height--)
3261     {
3262         int w;
3263         uint8_t *s = src_bytes;
3264         uint8_t *d = dst_bytes;
3265         src_bytes += src_stride;
3266         dst_bytes += dst_stride;
3267         w = byte_width;
3268
3269         if (w >= 1 && ((unsigned long)d & 1))
3270         {
3271             *(uint8_t *)d = *(uint8_t *)s;
3272             w -= 1;
3273             s += 1;
3274             d += 1;
3275         }
3276
3277         if (w >= 2 && ((unsigned long)d & 3))
3278         {
3279             *(uint16_t *)d = *(uint16_t *)s;
3280             w -= 2;
3281             s += 2;
3282             d += 2;
3283         }
3284
3285         while (w >= 4 && ((unsigned long)d & 7))
3286         {
3287             *(uint32_t *)d = ldl_u ((uint32_t *)s);
3288
3289             w -= 4;
3290             s += 4;
3291             d += 4;
3292         }
3293
3294         while (w >= 64)
3295         {
3296 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3297             __asm__ (
3298                 "movq     (%1),   %%mm0\n"
3299                 "movq    8(%1),   %%mm1\n"
3300                 "movq   16(%1),   %%mm2\n"
3301                 "movq   24(%1),   %%mm3\n"
3302                 "movq   32(%1),   %%mm4\n"
3303                 "movq   40(%1),   %%mm5\n"
3304                 "movq   48(%1),   %%mm6\n"
3305                 "movq   56(%1),   %%mm7\n"
3306
3307                 "movq   %%mm0,    (%0)\n"
3308                 "movq   %%mm1,   8(%0)\n"
3309                 "movq   %%mm2,  16(%0)\n"
3310                 "movq   %%mm3,  24(%0)\n"
3311                 "movq   %%mm4,  32(%0)\n"
3312                 "movq   %%mm5,  40(%0)\n"
3313                 "movq   %%mm6,  48(%0)\n"
3314                 "movq   %%mm7,  56(%0)\n"
3315                 :
3316                 : "r" (d), "r" (s)
3317                 : "memory",
3318                   "%mm0", "%mm1", "%mm2", "%mm3",
3319                   "%mm4", "%mm5", "%mm6", "%mm7");
3320 #else
3321             __m64 v0 = ldq_u ((__m64 *)(s + 0));
3322             __m64 v1 = ldq_u ((__m64 *)(s + 8));
3323             __m64 v2 = ldq_u ((__m64 *)(s + 16));
3324             __m64 v3 = ldq_u ((__m64 *)(s + 24));
3325             __m64 v4 = ldq_u ((__m64 *)(s + 32));
3326             __m64 v5 = ldq_u ((__m64 *)(s + 40));
3327             __m64 v6 = ldq_u ((__m64 *)(s + 48));
3328             __m64 v7 = ldq_u ((__m64 *)(s + 56));
3329             *(__m64 *)(d + 0)  = v0;
3330             *(__m64 *)(d + 8)  = v1;
3331             *(__m64 *)(d + 16) = v2;
3332             *(__m64 *)(d + 24) = v3;
3333             *(__m64 *)(d + 32) = v4;
3334             *(__m64 *)(d + 40) = v5;
3335             *(__m64 *)(d + 48) = v6;
3336             *(__m64 *)(d + 56) = v7;
3337 #endif
3338
3339             w -= 64;
3340             s += 64;
3341             d += 64;
3342         }
3343         while (w >= 4)
3344         {
3345             *(uint32_t *)d = ldl_u ((uint32_t *)s);
3346
3347             w -= 4;
3348             s += 4;
3349             d += 4;
3350         }
3351         if (w >= 2)
3352         {
3353             *(uint16_t *)d = *(uint16_t *)s;
3354             w -= 2;
3355             s += 2;
3356             d += 2;
3357         }
3358     }
3359
3360     _mm_empty ();
3361
3362     return TRUE;
3363 }
3364
3365 static void
3366 mmx_composite_copy_area (pixman_implementation_t *imp,
3367                          pixman_composite_info_t *info)
3368 {
3369     PIXMAN_COMPOSITE_ARGS (info);
3370
3371     pixman_blt_mmx (src_image->bits.bits,
3372                     dest_image->bits.bits,
3373                     src_image->bits.rowstride,
3374                     dest_image->bits.rowstride,
3375                     PIXMAN_FORMAT_BPP (src_image->bits.format),
3376                     PIXMAN_FORMAT_BPP (dest_image->bits.format),
3377                     src_x, src_y, dest_x, dest_y, width, height);
3378 }
3379
3380 static void
3381 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3382                                 pixman_composite_info_t *info)
3383 {
3384     PIXMAN_COMPOSITE_ARGS (info);
3385     uint32_t  *src, *src_line;
3386     uint32_t  *dst, *dst_line;
3387     uint8_t  *mask, *mask_line;
3388     int src_stride, mask_stride, dst_stride;
3389     int32_t w;
3390
3391     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3392     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3393     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3394
3395     while (height--)
3396     {
3397         src = src_line;
3398         src_line += src_stride;
3399         dst = dst_line;
3400         dst_line += dst_stride;
3401         mask = mask_line;
3402         mask_line += mask_stride;
3403
3404         w = width;
3405
3406         while (w--)
3407         {
3408             uint64_t m = *mask;
3409
3410             if (m)
3411             {
3412                 uint32_t ssrc = *src | 0xff000000;
3413                 __m64 s = load8888 (&ssrc);
3414
3415                 if (m == 0xff)
3416                 {
3417                     store8888 (dst, s);
3418                 }
3419                 else
3420                 {
3421                     __m64 sa = expand_alpha (s);
3422                     __m64 vm = expand_alpha_rev (to_m64 (m));
3423                     __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3424
3425                     store8888 (dst, vdest);
3426                 }
3427             }
3428
3429             mask++;
3430             dst++;
3431             src++;
3432         }
3433     }
3434
3435     _mm_empty ();
3436 }
3437
3438 static uint32_t *
3439 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3440 {
3441     int w = iter->width;
3442     uint32_t *dst = iter->buffer;
3443     uint32_t *src = (uint32_t *)iter->bits;
3444
3445     iter->bits += iter->stride;
3446
3447     while (w && ((unsigned long)dst) & 7)
3448     {
3449         *dst++ = (*src++) | 0xff000000;
3450         w--;
3451     }
3452
3453     while (w >= 8)
3454     {
3455         __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3456         __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3457         __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3458         __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3459
3460         *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3461         *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3462         *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3463         *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3464
3465         dst += 8;
3466         src += 8;
3467         w -= 8;
3468     }
3469
3470     while (w)
3471     {
3472         *dst++ = (*src++) | 0xff000000;
3473         w--;
3474     }
3475
3476     return iter->buffer;
3477 }
3478
3479 static uint32_t *
3480 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3481 {
3482     int w = iter->width;
3483     uint32_t *dst = iter->buffer;
3484     uint16_t *src = (uint16_t *)iter->bits;
3485
3486     iter->bits += iter->stride;
3487
3488     while (w && ((unsigned long)dst) & 0x0f)
3489     {
3490         uint16_t s = *src++;
3491
3492         *dst++ = CONVERT_0565_TO_8888 (s);
3493         w--;
3494     }
3495
3496     while (w >= 4)
3497     {
3498         __m64 vsrc = ldq_u ((__m64 *)src);
3499         __m64 mm0, mm1;
3500
3501         expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3502
3503         *(__m64 *)(dst + 0) = mm0;
3504         *(__m64 *)(dst + 2) = mm1;
3505
3506         dst += 4;
3507         src += 4;
3508         w -= 4;
3509     }
3510
3511     while (w)
3512     {
3513         uint16_t s = *src++;
3514
3515         *dst++ = CONVERT_0565_TO_8888 (s);
3516         w--;
3517     }
3518
3519     return iter->buffer;
3520 }
3521
3522 static uint32_t *
3523 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3524 {
3525     int w = iter->width;
3526     uint32_t *dst = iter->buffer;
3527     uint8_t *src = iter->bits;
3528
3529     iter->bits += iter->stride;
3530
3531     while (w && (((unsigned long)dst) & 15))
3532     {
3533         *dst++ = *(src++) << 24;
3534         w--;
3535     }
3536
3537     while (w >= 8)
3538     {
3539         __m64 mm0 = ldq_u ((__m64 *)src);
3540
3541         __m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
3542         __m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
3543         __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3544         __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3545         __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3546         __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3547
3548         *(__m64 *)(dst + 0) = mm3;
3549         *(__m64 *)(dst + 2) = mm4;
3550         *(__m64 *)(dst + 4) = mm5;
3551         *(__m64 *)(dst + 6) = mm6;
3552
3553         dst += 8;
3554         src += 8;
3555         w -= 8;
3556     }
3557
3558     while (w)
3559     {
3560         *dst++ = *(src++) << 24;
3561         w--;
3562     }
3563
3564     return iter->buffer;
3565 }
3566
3567 typedef struct
3568 {
3569     pixman_format_code_t        format;
3570     pixman_iter_get_scanline_t  get_scanline;
3571 } fetcher_info_t;
3572
3573 static const fetcher_info_t fetchers[] =
3574 {
3575     { PIXMAN_x8r8g8b8,          mmx_fetch_x8r8g8b8 },
3576     { PIXMAN_r5g6b5,            mmx_fetch_r5g6b5 },
3577     { PIXMAN_a8,                mmx_fetch_a8 },
3578     { PIXMAN_null }
3579 };
3580
3581 static void
3582 mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
3583 {
3584     pixman_image_t *image = iter->image;
3585     int x = iter->x;
3586     int y = iter->y;
3587     int width = iter->width;
3588     int height = iter->height;
3589
3590 #define FLAGS                                                           \
3591     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)
3592
3593     if ((iter->flags & ITER_NARROW)                             &&
3594         (image->common.flags & FLAGS) == FLAGS                  &&
3595         x >= 0 && y >= 0                                        &&
3596         x + width <= image->bits.width                          &&
3597         y + height <= image->bits.height)
3598     {
3599         const fetcher_info_t *f;
3600
3601         for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
3602         {
3603             if (image->common.extended_format_code == f->format)
3604             {
3605                 uint8_t *b = (uint8_t *)image->bits.bits;
3606                 int s = image->bits.rowstride * 4;
3607
3608                 iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
3609                 iter->stride = s;
3610
3611                 iter->get_scanline = f->get_scanline;
3612                 return;
3613             }
3614         }
3615     }
3616
3617     imp->delegate->src_iter_init (imp->delegate, iter);
3618 }
3619
/*
 * Table of composite fast paths provided by this implementation; it is
 * handed to _pixman_implementation_create () when the implementation is
 * created and ends with the { PIXMAN_OP_NONE } entry.
 *
 * NOTE(review): judging by the function names, the macro arguments are
 * (operator, source format, mask format, destination format, function);
 * confirm against the PIXMAN_STD_FAST_PATH definition.  The order of the
 * entries is presumably significant for lookup -- do not reorder without
 * checking.
 */
static const pixman_fast_path_t mmx_fast_paths[] =
{
    /* OVER */
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    /* OVER from an x888 source onto the same x888 layout is routed to the
     * plain copy routine. */
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    /* ADD */
    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    /* SRC */
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    /* IN */
    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    /* End-of-table marker. */
    { PIXMAN_OP_NONE },
};
3695
3696 static pixman_bool_t
3697 mmx_blt (pixman_implementation_t *imp,
3698          uint32_t *               src_bits,
3699          uint32_t *               dst_bits,
3700          int                      src_stride,
3701          int                      dst_stride,
3702          int                      src_bpp,
3703          int                      dst_bpp,
3704          int                      src_x,
3705          int                      src_y,
3706          int                      dest_x,
3707          int                      dest_y,
3708          int                      width,
3709          int                      height)
3710 {
3711     if (!pixman_blt_mmx (
3712             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3713             src_x, src_y, dest_x, dest_y, width, height))
3714
3715     {
3716         return _pixman_implementation_blt (
3717             imp->delegate,
3718             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3719             src_x, src_y, dest_x, dest_y, width, height);
3720     }
3721
3722     return TRUE;
3723 }
3724
3725 static pixman_bool_t
3726 mmx_fill (pixman_implementation_t *imp,
3727           uint32_t *               bits,
3728           int                      stride,
3729           int                      bpp,
3730           int                      x,
3731           int                      y,
3732           int                      width,
3733           int                      height,
3734           uint32_t xor)
3735 {
3736     if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
3737     {
3738         return _pixman_implementation_fill (
3739             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
3740     }
3741
3742     return TRUE;
3743 }
3744
3745 pixman_implementation_t *
3746 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
3747 {
3748     pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
3749
3750     imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
3751     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
3752     imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
3753     imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
3754     imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
3755     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
3756     imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
3757     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
3758     imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
3759     imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
3760     imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
3761
3762     imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
3763     imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
3764     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
3765     imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
3766     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
3767     imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
3768     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
3769     imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
3770     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
3771     imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
3772     imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
3773
3774     imp->blt = mmx_blt;
3775     imp->fill = mmx_fill;
3776
3777     imp->src_iter_init = mmx_src_iter_init;
3778
3779     return imp;
3780 }
3781
3782 #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */