2 * Copyright © 2004, 2005 Red Hat, Inc.
3 * Copyright © 2004 Nicholas Miell
4 * Copyright © 2005 Trolltech AS
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of Red Hat not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission. Red Hat makes no representations about the
13 * suitability of this software for any purpose. It is provided "as is"
14 * without express or implied warranty.
16 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
25 * Author: Søren Sandmann (sandmann@redhat.com)
26 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
29 * Based on work by Owen Taylor
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
38 #ifdef USE_LOONGSON_MMI
39 #include <loongson-mmintrin.h>
43 #include "pixman-private.h"
44 #include "pixman-combine32.h"
49 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
55 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
56 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
64 # if (defined(__SUNPRO_C) || defined(_MSC_VER))
65 # include <xmmintrin.h>
67 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
68 * instructions to be generated that we don't want. Just duplicate the
69 * functions we want to use. */
70 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
71 _mm_movemask_pi8 (__m64 __A)
75 asm ("pmovmskb %1, %0\n\t"
83 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
84 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
86 asm ("pmulhuw %1, %0\n\t"
94 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
99 asm ("pshufw %2, %1, %0\n\t"
101 : "y" (__A), "K" (__N)
107 # define _mm_shuffle_pi16(A, N) \
108 ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))
114 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
115 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
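/* Illustrative check, assuming the pshufw encoding used below (not compiled):
 * _MM_SHUFFLE (3, 3, 3, 3) == 0xff broadcasts word 3 -- the alpha lane of an
 * expanded 00AA00RR00GG00BB pixel -- into all four lanes, which is what
 * expand_alpha() relies on, and _MM_SHUFFLE (3, 0, 1, 2) == 0xc6 swaps the
 * red and blue lanes while keeping alpha in place, as invert_colors() needs.
 */
#if 0
typedef int check_broadcast_alpha_imm[_MM_SHUFFLE (3, 3, 3, 3) == 0xff ? 1 : -1];
typedef int check_swap_red_blue_imm  [_MM_SHUFFLE (3, 0, 1, 2) == 0xc6 ? 1 : -1];
#endif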
118 /* Notes about writing mmx code
120 * Give memory operands as the second operand. If you give it as the
121 * first, gcc will first load it into a register, then use that register.
126 * _mm_mullo_pi16 (x, mmx_constant);
130 * _mm_mullo_pi16 (mmx_constant, x);
132 * Also try to minimize dependencies, i.e. when you need a value, try
133 * to calculate it from a value that was calculated as early as possible.
137 /* --------------- MMX primitives ------------------------------------- */
139 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
140 * the name of the member used to access the data.
141 * If __m64 requires using mm_cvt* intrinsics functions to convert between
142 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
143 * If __m64 and uint64_t values can just be cast to each other directly,
144 * then define USE_M64_CASTS.
145 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
148 # define M64_MEMBER m64_u64
150 # define USE_CVT_INTRINSICS
151 #elif defined(USE_LOONGSON_MMI)
152 # define USE_M64_DOUBLE
153 #elif defined(__GNUC__)
154 # define USE_M64_CASTS
155 #elif defined(__SUNPRO_C)
156 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
157 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
158 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
159 * is defined. If it is used, then the mm_cvt* intrinsics must be used.
161 # define USE_CVT_INTRINSICS
163 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
164 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
166 # define M64_MEMBER l_
170 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
171 typedef uint64_t mmxdatafield;
173 typedef __m64 mmxdatafield;
178 mmxdatafield mmx_4x00ff;
179 mmxdatafield mmx_4x0080;
180 mmxdatafield mmx_565_rgb;
181 mmxdatafield mmx_565_unpack_multiplier;
182 mmxdatafield mmx_565_r;
183 mmxdatafield mmx_565_g;
184 mmxdatafield mmx_565_b;
185 mmxdatafield mmx_mask_0;
186 mmxdatafield mmx_mask_1;
187 mmxdatafield mmx_mask_2;
188 mmxdatafield mmx_mask_3;
189 mmxdatafield mmx_full_alpha;
190 mmxdatafield mmx_4x0101;
193 #if defined(_MSC_VER)
194 # define MMXDATA_INIT(field, val) { val ## UI64 }
195 #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */
196 # define MMXDATA_INIT(field, val) field = { val ## ULL }
197 #else /* mmxdatafield is an integral type */
198 # define MMXDATA_INIT(field, val) field = val ## ULL
201 static const mmx_data_t c =
203 MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
204 MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
205 MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
206 MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
207 MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
208 MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
209 MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
210 MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
211 MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
212 MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
213 MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
214 MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
215 MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
218 #ifdef USE_CVT_INTRINSICS
219 # define MC(x) to_m64 (c.mmx_ ## x)
220 #elif defined(USE_M64_CASTS)
221 # define MC(x) ((__m64)c.mmx_ ## x)
222 #elif defined(USE_M64_DOUBLE)
223 # define MC(x) (*(__m64 *)&c.mmx_ ## x)
225 # define MC(x) c.mmx_ ## x
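/* Whichever representation is selected above, MC (4x00ff) evaluates to the
 * 0x00ff00ff00ff00ff entry of the constant table as an __m64, so the helpers
 * below never need to know how __m64 is actually declared on this compiler.
 */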
228 static force_inline __m64
231 #ifdef USE_CVT_INTRINSICS
232 return _mm_cvtsi64_m64 (x);
233 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
238 #elif defined USE_M64_DOUBLE
240 #else /* USE_M64_CASTS */
245 static force_inline uint64_t
248 #ifdef USE_CVT_INTRINSICS
249 return _mm_cvtm64_si64 (x);
250 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
251 uint64_t res = x.M64_MEMBER;
253 #elif defined USE_M64_DOUBLE
254 return *(uint64_t *)&x;
255 #else /* USE_M64_CASTS */
260 static force_inline __m64
265 return _mm_slli_si64 (v, s);
267 return _mm_srli_si64 (v, -s);
272 static force_inline __m64
275 return _mm_xor_si64 (mask, MC (4x00ff));
278 static force_inline __m64
279 pix_multiply (__m64 a, __m64 b)
283 res = _mm_mullo_pi16 (a, b);
284 res = _mm_adds_pu16 (res, MC (4x0080));
285 res = _mm_mulhi_pu16 (res, MC (4x0101));
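/* Rough scalar equivalent of the three steps above, per 8-bit channel (a
 * sketch, not compiled): adding 0x80 and taking the high half of the multiply
 * by 0x0101 gives a correctly rounded a * b / 255, the same scheme as the
 * MUL_UN8 macro from pixman-combine32.h.
 */
#if 0
static uint8_t
pix_multiply_scalar_sketch (uint8_t a, uint8_t b)
{
    uint32_t t = (uint32_t) a * b + 0x80;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}
#endif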
290 static force_inline __m64
291 pix_add (__m64 a, __m64 b)
293 return _mm_adds_pu8 (a, b);
296 static force_inline __m64
297 expand_alpha (__m64 pixel)
299 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
302 static force_inline __m64
303 expand_alpha_rev (__m64 pixel)
305 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
308 static force_inline __m64
309 invert_colors (__m64 pixel)
311 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
314 static force_inline __m64
319 return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
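/* over() is the premultiplied OVER operator: each destination channel is
 * scaled by (255 - source alpha) and the source is added with unsigned
 * saturation.  Rough per-channel scalar sketch (assumes premultiplied src,
 * not compiled):
 */
#if 0
static uint8_t
over_scalar_sketch (uint8_t s, uint8_t sa, uint8_t d)
{
    uint32_t t;

    /* for valid premultiplied inputs the sum cannot exceed 255 */
    return (uint8_t) (s + MUL_UN8 (d, 255 - sa, t));
}
#endif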
322 static force_inline __m64
323 over_rev_non_pre (__m64 src, __m64 dest)
325 __m64 srca = expand_alpha (src);
326 __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
328 return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
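/* over_rev_non_pre() handles "pixbuf" style sources: non-premultiplied pixels
 * with red and blue swapped.  invert_colors() restores ARGB order, the
 * multiply by srcfaaa (alpha in the color lanes, 0xff in the alpha lane)
 * premultiplies the colors while leaving the source alpha untouched, and then
 * the ordinary over() above is applied.
 */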
331 static force_inline __m64
332 in (__m64 src, __m64 mask)
334 return pix_multiply (src, mask);
338 static force_inline __m64
339 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
341 return over (in (src, mask), pix_multiply (srca, mask), dest);
346 #define in_over(src, srca, mask, dest) \
347 over (in (src, mask), pix_multiply (srca, mask), dest)
351 /* Elemental unaligned loads */
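/* Three strategies are used below, depending on the target: plain x86 simply
 * dereferences the pointer (movq tolerates unaligned addresses), iwMMXt
 * assembles the value from two aligned loads with _mm_align_si64, and every
 * other target goes through a packed struct so the compiler emits an access
 * that is safe for unaligned pointers.
 */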
353 static force_inline __m64 ldq_u(__m64 *p)
356 /* x86's alignment restrictions are very relaxed. */
358 #elif defined USE_ARM_IWMMXT
359 int align = (uintptr_t)p & 7;
363 aligned_p = (__m64 *)((uintptr_t)p & ~7);
364 return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
366 struct __una_u64 { __m64 x __attribute__((packed)); };
367 const struct __una_u64 *ptr = (const struct __una_u64 *) p;
368 return (__m64) ptr->x;
372 static force_inline uint32_t ldl_u(const uint32_t *p)
375 /* x86's alignment restrictions are very relaxed. */
378 struct __una_u32 { uint32_t x __attribute__((packed)); };
379 const struct __una_u32 *ptr = (const struct __una_u32 *) p;
384 static force_inline __m64
385 load (const uint32_t *v)
387 #ifdef USE_LOONGSON_MMI
389 asm ("lwc1 %0, %1\n\t"
395 return _mm_cvtsi32_si64 (*v);
399 static force_inline __m64
400 load8888 (const uint32_t *v)
402 #ifdef USE_LOONGSON_MMI
403 return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
405 return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
409 static force_inline __m64
410 load8888u (const uint32_t *v)
412 uint32_t l = ldl_u (v);
413 return load8888 (&l);
416 static force_inline __m64
417 pack8888 (__m64 lo, __m64 hi)
419 return _mm_packs_pu16 (lo, hi);
422 static force_inline void
423 store (uint32_t *dest, __m64 v)
425 #ifdef USE_LOONGSON_MMI
426 asm ("swc1 %1, %0\n\t"
432 *dest = _mm_cvtsi64_si32 (v);
436 static force_inline void
437 store8888 (uint32_t *dest, __m64 v)
439 v = pack8888 (v, _mm_setzero_si64 ());
443 static force_inline pixman_bool_t
444 is_equal (__m64 a, __m64 b)
446 #ifdef USE_LOONGSON_MMI
447 /* __m64 is a double here, so we can compare directly. */
450 return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
454 static force_inline pixman_bool_t
457 #ifdef USE_LOONGSON_MMI
458 return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
460 __m64 ffs = _mm_cmpeq_pi8 (v, v);
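    /* In an expanded 00AA00RR00GG00BB pixel, byte 6 holds the alpha, so bit
     * 0x40 of the byte mask below is set exactly when alpha == 0xff. */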
461 return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
465 static force_inline pixman_bool_t
468 return is_equal (v, _mm_setzero_si64 ());
471 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into 00RR00GG00BB
475 * --- Expanding 565 in the low word ---
477 * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
478 * m = m & (01f0003f001f);
479 * m = m * (008404100840);
482 * Note the trick here - the top word is shifted by another nibble to
483 * avoid it bumping into the middle word
485 static force_inline __m64
486 expand565 (__m64 pixel, int pos)
491 /* move pixel to low 16 bit and zero the rest */
492 p = shift (shift (p, (3 - pos) * 16), -48);
494 t1 = shift (p, 36 - 11);
495 t2 = shift (p, 16 - 5);
497 p = _mm_or_si64 (t1, p);
498 p = _mm_or_si64 (t2, p);
499 p = _mm_and_si64 (p, MC (565_rgb));
501 pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
502 return _mm_srli_pi16 (pixel, 8);
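/* Rough scalar rendering of the shuffle/mask/multiply trick above (a sketch,
 * not compiled): each 565 field lands in its own 16-bit lane, and a per-lane
 * multiply plus shift replicates the top bits into the low bits, i.e. the
 * usual 5->8 and 6->8 bit expansion.
 */
#if 0
static void
expand565_scalar_sketch (uint16_t p, uint8_t *r, uint8_t *g, uint8_t *b)
{
    uint64_t m = p;

    m = (m << (36 - 11)) | (m << (16 - 5)) | m;
    m &= 0x000001f0003f001fULL;

    *r = (uint8_t) ((((m >> 32) & 0xffff) * 0x0084) >> 8);
    *g = (uint8_t) ((((m >> 16) & 0xffff) * 0x0410) >> 8);
    *b = (uint8_t) ((((m >>  0) & 0xffff) * 0x0840) >> 8);
}
#endif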
505 static force_inline __m64
506 expand8888 (__m64 in, int pos)
509 return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
511 return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
514 static force_inline __m64
515 expandx888 (__m64 in, int pos)
517 return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
520 static force_inline __m64
521 pack_565 (__m64 pixel, __m64 target, int pos)
527 r = _mm_and_si64 (p, MC (565_r));
528 g = _mm_and_si64 (p, MC (565_g));
529 b = _mm_and_si64 (p, MC (565_b));
531 r = shift (r, -(32 - 8) + pos * 16);
532 g = shift (g, -(16 - 3) + pos * 16);
533 b = shift (b, -(0 + 3) + pos * 16);
536 t = _mm_and_si64 (t, MC (mask_0));
538 t = _mm_and_si64 (t, MC (mask_1));
540 t = _mm_and_si64 (t, MC (mask_2));
542 t = _mm_and_si64 (t, MC (mask_3));
544 p = _mm_or_si64 (r, t);
545 p = _mm_or_si64 (g, p);
547 return _mm_or_si64 (b, p);
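/* The shifts and masks above amount to the usual 888 -> 565 truncation,
 * merged into 16-bit slot @pos of the target.  Per-pixel scalar sketch
 * (not compiled):
 */
#if 0
static uint16_t
pack_565_scalar_sketch (uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t) (((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
}
#endif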
552 static force_inline __m64
553 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
555 x = pix_multiply (x, a);
556 y = pix_multiply (y, b);
558 return pix_add (x, y);
563 #define pix_add_mul(x, a, y, b) \
564 ( x = pix_multiply (x, a), \
565 y = pix_multiply (y, b), \
570 /* --------------- MMX code patch for fbcompose.c --------------------- */
572 static force_inline uint32_t
573 combine (const uint32_t *src, const uint32_t *mask)
575 uint32_t ssrc = *src;
579 __m64 m = load8888 (mask);
580 __m64 s = load8888 (&ssrc);
582 m = expand_alpha (m);
583 s = pix_multiply (s, m);
585 store8888 (&ssrc, s);
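/* Scalar sketch of what combine() does when a mask is present (not compiled):
 * every channel of the source pixel is scaled by the mask's alpha before the
 * per-operator math in the combiners below runs.
 */
#if 0
static uint32_t
combine_scalar_sketch (uint32_t s, uint32_t m)
{
    uint32_t ma = m >> 24, t;

    uint32_t a = MUL_UN8 ((s >> 24) & 0xff, ma, t);
    uint32_t r = MUL_UN8 ((s >> 16) & 0xff, ma, t);
    uint32_t g = MUL_UN8 ((s >>  8) & 0xff, ma, t);
    uint32_t b = MUL_UN8 ((s >>  0) & 0xff, ma, t);

    return (a << 24) | (r << 16) | (g << 8) | b;
}
#endif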
592 mmx_combine_over_u (pixman_implementation_t *imp,
595 const uint32_t * src,
596 const uint32_t * mask,
599 const uint32_t *end = dest + width;
603 uint32_t ssrc = combine (src, mask);
604 uint32_t a = ssrc >> 24;
613 s = load8888 (&ssrc);
614 sa = expand_alpha (s);
615 store8888 (dest, over (s, sa, load8888 (dest)));
627 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
630 const uint32_t * src,
631 const uint32_t * mask,
634 const uint32_t *end = dest + width;
639 uint32_t s = combine (src, mask);
642 da = expand_alpha (d);
643 store8888 (dest, over (d, da, load8888 (&s)));
654 mmx_combine_in_u (pixman_implementation_t *imp,
657 const uint32_t * src,
658 const uint32_t * mask,
661 const uint32_t *end = dest + width;
666 uint32_t ssrc = combine (src, mask);
668 x = load8888 (&ssrc);
670 a = expand_alpha (a);
671 x = pix_multiply (x, a);
684 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
687 const uint32_t * src,
688 const uint32_t * mask,
691 const uint32_t *end = dest + width;
696 uint32_t ssrc = combine (src, mask);
699 a = load8888 (&ssrc);
700 a = expand_alpha (a);
701 x = pix_multiply (x, a);
713 mmx_combine_out_u (pixman_implementation_t *imp,
716 const uint32_t * src,
717 const uint32_t * mask,
720 const uint32_t *end = dest + width;
725 uint32_t ssrc = combine (src, mask);
727 x = load8888 (&ssrc);
729 a = expand_alpha (a);
731 x = pix_multiply (x, a);
743 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
746 const uint32_t * src,
747 const uint32_t * mask,
750 const uint32_t *end = dest + width;
755 uint32_t ssrc = combine (src, mask);
758 a = load8888 (&ssrc);
759 a = expand_alpha (a);
761 x = pix_multiply (x, a);
774 mmx_combine_atop_u (pixman_implementation_t *imp,
777 const uint32_t * src,
778 const uint32_t * mask,
781 const uint32_t *end = dest + width;
786 uint32_t ssrc = combine (src, mask);
788 s = load8888 (&ssrc);
790 sia = expand_alpha (s);
792 da = expand_alpha (d);
793 s = pix_add_mul (s, da, d, sia);
805 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
808 const uint32_t * src,
809 const uint32_t * mask,
819 uint32_t ssrc = combine (src, mask);
821 s = load8888 (&ssrc);
823 sa = expand_alpha (s);
824 dia = expand_alpha (d);
826 s = pix_add_mul (s, dia, d, sa);
838 mmx_combine_xor_u (pixman_implementation_t *imp,
841 const uint32_t * src,
842 const uint32_t * mask,
845 const uint32_t *end = dest + width;
849 __m64 s, dia, d, sia;
850 uint32_t ssrc = combine (src, mask);
852 s = load8888 (&ssrc);
854 sia = expand_alpha (s);
855 dia = expand_alpha (d);
858 s = pix_add_mul (s, dia, d, sia);
870 mmx_combine_add_u (pixman_implementation_t *imp,
873 const uint32_t * src,
874 const uint32_t * mask,
877 const uint32_t *end = dest + width;
882 uint32_t ssrc = combine (src, mask);
884 s = load8888 (&ssrc);
898 mmx_combine_saturate_u (pixman_implementation_t *imp,
901 const uint32_t * src,
902 const uint32_t * mask,
905 const uint32_t *end = dest + width;
909 uint32_t s = combine (src, mask);
911 __m64 ms = load8888 (&s);
912 __m64 md = load8888 (&d);
913 uint32_t sa = s >> 24;
914 uint32_t da = ~d >> 24;
918 uint32_t quot = DIV_UN8 (da, sa) << 24;
919 __m64 msa = load8888 (&quot);
920 msa = expand_alpha (msa);
921 ms = pix_multiply (ms, msa);
924 md = pix_add (md, ms);
925 store8888 (dest, md);
936 mmx_combine_src_ca (pixman_implementation_t *imp,
939 const uint32_t * src,
940 const uint32_t * mask,
943 const uint32_t *end = src + width;
947 __m64 a = load8888 (mask);
948 __m64 s = load8888 (src);
950 s = pix_multiply (s, a);
961 mmx_combine_over_ca (pixman_implementation_t *imp,
964 const uint32_t * src,
965 const uint32_t * mask,
968 const uint32_t *end = src + width;
972 __m64 a = load8888 (mask);
973 __m64 s = load8888 (src);
974 __m64 d = load8888 (dest);
975 __m64 sa = expand_alpha (s);
977 store8888 (dest, in_over (s, sa, a, d));
987 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
990 const uint32_t * src,
991 const uint32_t * mask,
994 const uint32_t *end = src + width;
998 __m64 a = load8888 (mask);
999 __m64 s = load8888 (src);
1000 __m64 d = load8888 (dest);
1001 __m64 da = expand_alpha (d);
1003 store8888 (dest, over (d, da, in (s, a)));
1013 mmx_combine_in_ca (pixman_implementation_t *imp,
1016 const uint32_t * src,
1017 const uint32_t * mask,
1020 const uint32_t *end = src + width;
1024 __m64 a = load8888 (mask);
1025 __m64 s = load8888 (src);
1026 __m64 d = load8888 (dest);
1027 __m64 da = expand_alpha (d);
1029 s = pix_multiply (s, a);
1030 s = pix_multiply (s, da);
1031 store8888 (dest, s);
1041 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1044 const uint32_t * src,
1045 const uint32_t * mask,
1048 const uint32_t *end = src + width;
1052 __m64 a = load8888 (mask);
1053 __m64 s = load8888 (src);
1054 __m64 d = load8888 (dest);
1055 __m64 sa = expand_alpha (s);
1057 a = pix_multiply (a, sa);
1058 d = pix_multiply (d, a);
1059 store8888 (dest, d);
1069 mmx_combine_out_ca (pixman_implementation_t *imp,
1072 const uint32_t * src,
1073 const uint32_t * mask,
1076 const uint32_t *end = src + width;
1080 __m64 a = load8888 (mask);
1081 __m64 s = load8888 (src);
1082 __m64 d = load8888 (dest);
1083 __m64 da = expand_alpha (d);
1086 s = pix_multiply (s, a);
1087 s = pix_multiply (s, da);
1088 store8888 (dest, s);
1098 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1101 const uint32_t * src,
1102 const uint32_t * mask,
1105 const uint32_t *end = src + width;
1109 __m64 a = load8888 (mask);
1110 __m64 s = load8888 (src);
1111 __m64 d = load8888 (dest);
1112 __m64 sa = expand_alpha (s);
1114 a = pix_multiply (a, sa);
1116 d = pix_multiply (d, a);
1117 store8888 (dest, d);
1127 mmx_combine_atop_ca (pixman_implementation_t *imp,
1130 const uint32_t * src,
1131 const uint32_t * mask,
1134 const uint32_t *end = src + width;
1138 __m64 a = load8888 (mask);
1139 __m64 s = load8888 (src);
1140 __m64 d = load8888 (dest);
1141 __m64 da = expand_alpha (d);
1142 __m64 sa = expand_alpha (s);
1144 s = pix_multiply (s, a);
1145 a = pix_multiply (a, sa);
1147 d = pix_add_mul (d, a, s, da);
1148 store8888 (dest, d);
1158 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1161 const uint32_t * src,
1162 const uint32_t * mask,
1165 const uint32_t *end = src + width;
1169 __m64 a = load8888 (mask);
1170 __m64 s = load8888 (src);
1171 __m64 d = load8888 (dest);
1172 __m64 da = expand_alpha (d);
1173 __m64 sa = expand_alpha (s);
1175 s = pix_multiply (s, a);
1176 a = pix_multiply (a, sa);
1178 d = pix_add_mul (d, a, s, da);
1179 store8888 (dest, d);
1189 mmx_combine_xor_ca (pixman_implementation_t *imp,
1192 const uint32_t * src,
1193 const uint32_t * mask,
1196 const uint32_t *end = src + width;
1200 __m64 a = load8888 (mask);
1201 __m64 s = load8888 (src);
1202 __m64 d = load8888 (dest);
1203 __m64 da = expand_alpha (d);
1204 __m64 sa = expand_alpha (s);
1206 s = pix_multiply (s, a);
1207 a = pix_multiply (a, sa);
1210 d = pix_add_mul (d, a, s, da);
1211 store8888 (dest, d);
1221 mmx_combine_add_ca (pixman_implementation_t *imp,
1224 const uint32_t * src,
1225 const uint32_t * mask,
1228 const uint32_t *end = src + width;
1232 __m64 a = load8888 (mask);
1233 __m64 s = load8888 (src);
1234 __m64 d = load8888 (dest);
1236 s = pix_multiply (s, a);
1238 store8888 (dest, d);
1247 /* ------------- MMX code paths called from fbpict.c -------------------- */
1250 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1251 pixman_composite_info_t *info)
1253 PIXMAN_COMPOSITE_ARGS (info);
1255 uint32_t *dst_line, *dst;
1262 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1267 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1269 vsrc = load8888 (&src);
1270 vsrca = expand_alpha (vsrc);
1275 dst_line += dst_stride;
1280 while (w && (unsigned long)dst & 7)
1282 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1293 vdest = *(__m64 *)dst;
1295 dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1296 dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1298 *(__m64 *)dst = pack8888 (dest0, dest1);
1308 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1316 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1317 pixman_composite_info_t *info)
1319 PIXMAN_COMPOSITE_ARGS (info);
1321 uint16_t *dst_line, *dst;
1328 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1333 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1335 vsrc = load8888 (&src);
1336 vsrca = expand_alpha (vsrc);
1341 dst_line += dst_stride;
1346 while (w && (unsigned long)dst & 7)
1349 __m64 vdest = expand565 (to_m64 (d), 0);
1351 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1352 *dst = to_uint64 (vdest);
1362 vdest = *(__m64 *)dst;
1364 vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
1365 vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
1366 vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
1367 vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);
1369 *(__m64 *)dst = vdest;
1380 __m64 vdest = expand565 (to_m64 (d), 0);
1382 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1383 *dst = to_uint64 (vdest);
1394 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1395 pixman_composite_info_t *info)
1397 PIXMAN_COMPOSITE_ARGS (info);
1400 uint32_t *mask_line;
1401 int dst_stride, mask_stride;
1406 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1411 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1412 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1414 vsrc = load8888 (&src);
1415 vsrca = expand_alpha (vsrc);
1420 uint32_t *p = (uint32_t *)mask_line;
1421 uint32_t *q = (uint32_t *)dst_line;
1423 while (twidth && (unsigned long)q & 7)
1425 uint32_t m = *(uint32_t *)p;
1429 __m64 vdest = load8888 (q);
1430 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1431 store8888 (q, vdest);
1448 __m64 vdest = *(__m64 *)q;
1450 dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1451 expand8888 (vdest, 0));
1452 dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1453 expand8888 (vdest, 1));
1455 *(__m64 *)q = pack8888 (dest0, dest1);
1465 uint32_t m = *(uint32_t *)p;
1469 __m64 vdest = load8888 (q);
1470 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1471 store8888 (q, vdest);
1479 dst_line += dst_stride;
1480 mask_line += mask_stride;
1487 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1488 pixman_composite_info_t *info)
1490 PIXMAN_COMPOSITE_ARGS (info);
1491 uint32_t *dst_line, *dst;
1492 uint32_t *src_line, *src;
1495 int dst_stride, src_stride;
1500 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1501 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1503 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1505 mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1506 vmask = load8888 (&mask);
1511 dst_line += dst_stride;
1513 src_line += src_stride;
1516 while (w && (unsigned long)dst & 7)
1518 __m64 s = load8888 (src);
1519 __m64 d = load8888 (dst);
1521 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1530 __m64 vs = ldq_u ((__m64 *)src);
1531 __m64 vd = *(__m64 *)dst;
1532 __m64 vsrc0 = expand8888 (vs, 0);
1533 __m64 vsrc1 = expand8888 (vs, 1);
1535 *(__m64 *)dst = pack8888 (
1536 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1537 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1546 __m64 s = load8888 (src);
1547 __m64 d = load8888 (dst);
1549 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1557 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1558 pixman_composite_info_t *info)
1560 PIXMAN_COMPOSITE_ARGS (info);
1561 uint32_t *dst_line, *dst;
1562 uint32_t *src_line, *src;
1565 int dst_stride, src_stride;
1571 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1572 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1573 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1576 mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1577 vmask = load8888 (&mask);
1583 dst_line += dst_stride;
1585 src_line += src_stride;
1588 while (w && (unsigned long)dst & 7)
1590 uint32_t ssrc = *src | 0xff000000;
1591 __m64 s = load8888 (&ssrc);
1592 __m64 d = load8888 (dst);
1594 store8888 (dst, in_over (s, srca, vmask, d));
1603 __m64 vd0 = *(__m64 *)(dst + 0);
1604 __m64 vd1 = *(__m64 *)(dst + 2);
1605 __m64 vd2 = *(__m64 *)(dst + 4);
1606 __m64 vd3 = *(__m64 *)(dst + 6);
1607 __m64 vd4 = *(__m64 *)(dst + 8);
1608 __m64 vd5 = *(__m64 *)(dst + 10);
1609 __m64 vd6 = *(__m64 *)(dst + 12);
1610 __m64 vd7 = *(__m64 *)(dst + 14);
1612 __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1613 __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1614 __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1615 __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1616 __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1617 __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1618 __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1619 __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1622 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1623 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1626 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1627 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1630 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1631 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1634 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1635 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1638 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1639 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1642 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1643 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1646 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1647 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1650 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1651 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1653 *(__m64 *)(dst + 0) = vd0;
1654 *(__m64 *)(dst + 2) = vd1;
1655 *(__m64 *)(dst + 4) = vd2;
1656 *(__m64 *)(dst + 6) = vd3;
1657 *(__m64 *)(dst + 8) = vd4;
1658 *(__m64 *)(dst + 10) = vd5;
1659 *(__m64 *)(dst + 12) = vd6;
1660 *(__m64 *)(dst + 14) = vd7;
1669 uint32_t ssrc = *src | 0xff000000;
1670 __m64 s = load8888 (&ssrc);
1671 __m64 d = load8888 (dst);
1673 store8888 (dst, in_over (s, srca, vmask, d));
1685 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1686 pixman_composite_info_t *info)
1688 PIXMAN_COMPOSITE_ARGS (info);
1689 uint32_t *dst_line, *dst;
1690 uint32_t *src_line, *src;
1692 int dst_stride, src_stride;
1698 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1699 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1704 dst_line += dst_stride;
1706 src_line += src_stride;
1722 sa = expand_alpha (ms);
1723 store8888 (dst, over (ms, sa, load8888 (dst)));
1733 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1734 pixman_composite_info_t *info)
1736 PIXMAN_COMPOSITE_ARGS (info);
1737 uint16_t *dst_line, *dst;
1738 uint32_t *src_line, *src;
1739 int dst_stride, src_stride;
1744 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1745 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1749 assert (src_image->drawable == mask_image->drawable);
1755 dst_line += dst_stride;
1757 src_line += src_stride;
1762 while (w && (unsigned long)dst & 7)
1764 __m64 vsrc = load8888 (src);
1766 __m64 vdest = expand565 (to_m64 (d), 0);
1769 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1771 *dst = to_uint64 (vdest);
1782 __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1785 vsrc0 = load8888 ((src + 0));
1786 vsrc1 = load8888 ((src + 1));
1787 vsrc2 = load8888 ((src + 2));
1788 vsrc3 = load8888 ((src + 3));
1790 vdest = *(__m64 *)dst;
1792 vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
1793 vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
1794 vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
1795 vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);
1797 *(__m64 *)dst = vdest;
1808 __m64 vsrc = load8888 (src);
1810 __m64 vdest = expand565 (to_m64 (d), 0);
1812 vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1814 *dst = to_uint64 (vdest);
1826 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1827 pixman_composite_info_t *info)
1829 PIXMAN_COMPOSITE_ARGS (info);
1831 uint32_t *dst_line, *dst;
1832 uint8_t *mask_line, *mask;
1833 int dst_stride, mask_stride;
1840 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1846 srcsrc = (uint64_t)src << 32 | src;
1848 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1849 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1851 vsrc = load8888 (&src);
1852 vsrca = expand_alpha (vsrc);
1857 dst_line += dst_stride;
1859 mask_line += mask_stride;
1864 while (w && (unsigned long)dst & 7)
1870 __m64 vdest = in_over (vsrc, vsrca,
1871 expand_alpha_rev (to_m64 (m)),
1874 store8888 (dst, vdest);
1891 if (srca == 0xff && (m0 & m1) == 0xff)
1893 *(uint64_t *)dst = srcsrc;
1900 vdest = *(__m64 *)dst;
1902 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
1903 expand8888 (vdest, 0));
1904 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
1905 expand8888 (vdest, 1));
1907 *(__m64 *)dst = pack8888 (dest0, dest1);
1923 __m64 vdest = load8888 (dst);
1926 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
1927 store8888 (dst, vdest);
1936 pixman_fill_mmx (uint32_t *bits,
1947 uint32_t byte_width;
1950 #if defined __GNUC__ && defined USE_X86_MMX
1951 __m64 v1, v2, v3, v4, v5, v6, v7;
1954 if (bpp != 16 && bpp != 32 && bpp != 8)
1959 stride = stride * (int) sizeof (uint32_t) / 1;
1960 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
1963 xor = (xor & 0xff) * 0x01010101;
1967 stride = stride * (int) sizeof (uint32_t) / 2;
1968 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
1969 byte_width = 2 * width;
1971 xor = (xor & 0xffff) * 0x00010001;
1975 stride = stride * (int) sizeof (uint32_t) / 4;
1976 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
1977 byte_width = 4 * width;
1981 fill = ((uint64_t)xor << 32) | xor;
1982 vfill = to_m64 (fill);
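    /* On GNU x86 the fill value is also copied into seven more MMX registers,
     * so the unrolled loop below can issue eight 8-byte stores (64 bytes) per
     * iteration without reloading anything. */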
1984 #if defined __GNUC__ && defined USE_X86_MMX
1993 : "=&y" (v1), "=&y" (v2), "=&y" (v3),
1994 "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2001 uint8_t *d = byte_line;
2003 byte_line += stride;
2006 if (w >= 1 && ((unsigned long)d & 1))
2008 *(uint8_t *)d = (xor & 0xff);
2013 if (w >= 2 && ((unsigned long)d & 3))
2015 *(uint16_t *)d = xor;
2020 while (w >= 4 && ((unsigned long)d & 7))
2022 *(uint32_t *)d = xor;
2030 #if defined __GNUC__ && defined USE_X86_MMX
2042 "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2043 "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2046 *(__m64*) (d + 0) = vfill;
2047 *(__m64*) (d + 8) = vfill;
2048 *(__m64*) (d + 16) = vfill;
2049 *(__m64*) (d + 24) = vfill;
2050 *(__m64*) (d + 32) = vfill;
2051 *(__m64*) (d + 40) = vfill;
2052 *(__m64*) (d + 48) = vfill;
2053 *(__m64*) (d + 56) = vfill;
2061 *(uint32_t *)d = xor;
2068 *(uint16_t *)d = xor;
2074 *(uint8_t *)d = (xor & 0xff);
2086 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2087 pixman_composite_info_t *info)
2089 PIXMAN_COMPOSITE_ARGS (info);
2091 uint32_t *dst_line, *dst;
2092 uint8_t *mask_line, *mask;
2093 int dst_stride, mask_stride;
2100 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2105 pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride,
2106 PIXMAN_FORMAT_BPP (dest_image->bits.format),
2107 dest_x, dest_y, width, height, 0);
2111 srcsrc = (uint64_t)src << 32 | src;
2113 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2114 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2116 vsrc = load8888 (&src);
2121 dst_line += dst_stride;
2123 mask_line += mask_stride;
2128 while (w && (unsigned long)dst & 7)
2134 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2136 store8888 (dst, vdest);
2156 if (srca == 0xff && (m0 & m1) == 0xff)
2158 *(uint64_t *)dst = srcsrc;
2164 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2165 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2167 *(__m64 *)dst = pack8888 (dest0, dest1);
2171 *(uint64_t *)dst = 0;
2187 __m64 vdest = load8888 (dst);
2189 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2190 store8888 (dst, vdest);
2203 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2204 pixman_composite_info_t *info)
2206 PIXMAN_COMPOSITE_ARGS (info);
2208 uint16_t *dst_line, *dst;
2209 uint8_t *mask_line, *mask;
2210 int dst_stride, mask_stride;
2212 __m64 vsrc, vsrca, tmp;
2217 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2223 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2224 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2226 vsrc = load8888 (&src);
2227 vsrca = expand_alpha (vsrc);
2229 tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2230 srcsrcsrcsrc = expand_alpha_rev (tmp);
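    /* srcsrcsrcsrc is the packed 565 source replicated into all four 16-bit
     * lanes, so runs where both source and mask are fully opaque can be
     * written with a single 64-bit store. */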
2235 dst_line += dst_stride;
2237 mask_line += mask_stride;
2242 while (w && (unsigned long)dst & 7)
2249 __m64 vd = to_m64 (d);
2250 __m64 vdest = in_over (
2251 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2253 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2254 *dst = to_uint64 (vd);
2266 uint64_t m0, m1, m2, m3;
2272 if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2274 *(__m64 *)dst = srcsrcsrcsrc;
2276 else if (m0 | m1 | m2 | m3)
2279 __m64 vm0, vm1, vm2, vm3;
2281 vdest = *(__m64 *)dst;
2284 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
2285 expand565 (vdest, 0)), vdest, 0);
2287 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
2288 expand565 (vdest, 1)), vdest, 1);
2290 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
2291 expand565 (vdest, 2)), vdest, 2);
2293 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
2294 expand565 (vdest, 3)), vdest, 3);
2296 *(__m64 *)dst = vdest;
2313 __m64 vd = to_m64 (d);
2314 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2316 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2317 *dst = to_uint64 (vd);
2330 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2331 pixman_composite_info_t *info)
2333 PIXMAN_COMPOSITE_ARGS (info);
2334 uint16_t *dst_line, *dst;
2335 uint32_t *src_line, *src;
2336 int dst_stride, src_stride;
2341 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2342 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2346 assert (src_image->drawable == mask_image->drawable);
2352 dst_line += dst_stride;
2354 src_line += src_stride;
2359 while (w && (unsigned long)dst & 7)
2361 __m64 vsrc = load8888 (src);
2363 __m64 vdest = expand565 (to_m64 (d), 0);
2365 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2367 *dst = to_uint64 (vdest);
2378 uint32_t s0, s1, s2, s3;
2379 unsigned char a0, a1, a2, a3;
2391 if ((a0 & a1 & a2 & a3) == 0xFF)
2394 vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0);
2395 vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1);
2396 vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2);
2397 vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3);
2399 *(__m64 *)dst = vdest;
2401 else if (s0 | s1 | s2 | s3)
2403 __m64 vdest = *(__m64 *)dst;
2405 vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0);
2406 vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1);
2407 vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2);
2408 vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3);
2410 *(__m64 *)dst = vdest;
2422 __m64 vsrc = load8888 (src);
2424 __m64 vdest = expand565 (to_m64 (d), 0);
2426 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2428 *dst = to_uint64 (vdest);
2440 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2441 pixman_composite_info_t *info)
2443 PIXMAN_COMPOSITE_ARGS (info);
2444 uint32_t *dst_line, *dst;
2445 uint32_t *src_line, *src;
2446 int dst_stride, src_stride;
2451 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2452 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2456 assert (src_image->drawable == mask_image->drawable);
2462 dst_line += dst_stride;
2464 src_line += src_stride;
2467 while (w && (unsigned long)dst & 7)
2469 __m64 s = load8888 (src);
2470 __m64 d = load8888 (dst);
2472 store8888 (dst, over_rev_non_pre (s, d));
2482 unsigned char a0, a1;
2491 if ((a0 & a1) == 0xFF)
2493 d0 = invert_colors (load8888 (&s0));
2494 d1 = invert_colors (load8888 (&s1));
2496 *(__m64 *)dst = pack8888 (d0, d1);
2500 __m64 vdest = *(__m64 *)dst;
2502 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2503 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2505 *(__m64 *)dst = pack8888 (d0, d1);
2515 __m64 s = load8888 (src);
2516 __m64 d = load8888 (dst);
2518 store8888 (dst, over_rev_non_pre (s, d));
2526 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2527 pixman_composite_info_t *info)
2529 PIXMAN_COMPOSITE_ARGS (info);
2532 uint32_t *mask_line;
2533 int dst_stride, mask_stride;
2538 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2543 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2544 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2546 vsrc = load8888 (&src);
2547 vsrca = expand_alpha (vsrc);
2552 uint32_t *p = (uint32_t *)mask_line;
2553 uint16_t *q = (uint16_t *)dst_line;
2555 while (twidth && ((unsigned long)q & 7))
2557 uint32_t m = *(uint32_t *)p;
2562 __m64 vdest = expand565 (to_m64 (d), 0);
2563 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2564 *q = to_uint64 (vdest);
2574 uint32_t m0, m1, m2, m3;
2581 if ((m0 | m1 | m2 | m3))
2583 __m64 vdest = *(__m64 *)q;
2585 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0);
2586 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1);
2587 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2);
2588 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3);
2590 *(__m64 *)q = vdest;
2605 __m64 vdest = expand565 (to_m64 (d), 0);
2606 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2607 *q = to_uint64 (vdest);
2615 mask_line += mask_stride;
2616 dst_line += dst_stride;
2623 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2624 pixman_composite_info_t *info)
2626 PIXMAN_COMPOSITE_ARGS (info);
2627 uint8_t *dst_line, *dst;
2628 uint8_t *mask_line, *mask;
2629 int dst_stride, mask_stride;
2635 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2636 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2638 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2642 vsrc = load8888 (&src);
2643 vsrca = expand_alpha (vsrc);
2648 dst_line += dst_stride;
2650 mask_line += mask_stride;
2653 while (w && (unsigned long)dst & 7)
2662 m = MUL_UN8 (sa, a, tmp);
2663 d = MUL_UN8 (m, d, tmp);
2674 vmask = load8888u ((uint32_t *)mask);
2675 vdest = load8888 ((uint32_t *)dst);
2677 store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2693 m = MUL_UN8 (sa, a, tmp);
2694 d = MUL_UN8 (m, d, tmp);
2704 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2705 pixman_composite_info_t *info)
2707 PIXMAN_COMPOSITE_ARGS (info);
2708 uint8_t *dst_line, *dst;
2709 uint8_t *src_line, *src;
2710 int src_stride, dst_stride;
2713 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2714 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2719 dst_line += dst_stride;
2721 src_line += src_stride;
2724 while (w && (unsigned long)dst & 3)
2732 *dst = MUL_UN8 (s, d, tmp);
2741 uint32_t *s = (uint32_t *)src;
2742 uint32_t *d = (uint32_t *)dst;
2744 store8888 (d, in (load8888u (s), load8888 (d)));
2759 *dst = MUL_UN8 (s, d, tmp);
2770 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2771 pixman_composite_info_t *info)
2773 PIXMAN_COMPOSITE_ARGS (info);
2774 uint8_t *dst_line, *dst;
2775 uint8_t *mask_line, *mask;
2776 int dst_stride, mask_stride;
2782 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2783 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2785 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2792 vsrc = load8888 (&src);
2793 vsrca = expand_alpha (vsrc);
2798 dst_line += dst_stride;
2800 mask_line += mask_stride;
2803 while (w && (unsigned long)dst & 3)
2813 m = MUL_UN8 (sa, a, tmp);
2814 r = ADD_UN8 (m, d, tmp);
2825 vmask = load8888u ((uint32_t *)mask);
2826 vdest = load8888 ((uint32_t *)dst);
2828 store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
2845 m = MUL_UN8 (sa, a, tmp);
2846 r = ADD_UN8 (m, d, tmp);
2856 mmx_composite_add_8_8 (pixman_implementation_t *imp,
2857 pixman_composite_info_t *info)
2859 PIXMAN_COMPOSITE_ARGS (info);
2860 uint8_t *dst_line, *dst;
2861 uint8_t *src_line, *src;
2862 int dst_stride, src_stride;
2869 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2870 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2875 dst_line += dst_stride;
2877 src_line += src_stride;
2880 while (w && (unsigned long)dst & 7)
2885 s = t | (0 - (t >> 8));
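            /* Branch-free saturating add: when the 8-bit sum overflows,
             * t >> 8 is 1, 0 - 1 is an all-ones mask, and the OR above
             * forces the stored byte to 0xff. */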
2895 *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
2906 s = t | (0 - (t >> 8));
2919 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
2920 pixman_composite_info_t *info)
2922 PIXMAN_COMPOSITE_ARGS (info);
2923 uint32_t *dst_line, *dst;
2924 uint32_t *src_line, *src;
2925 int dst_stride, src_stride;
2930 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2931 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2936 dst_line += dst_stride;
2938 src_line += src_stride;
2941 while (w && (unsigned long)dst & 7)
2943 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
2944 load ((const uint32_t *)dst)));
2952 *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
2960 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
2961 load ((const uint32_t *)dst)));
2969 static pixman_bool_t
2970 pixman_blt_mmx (uint32_t *src_bits,
2983 uint8_t * src_bytes;
2984 uint8_t * dst_bytes;
2987 if (src_bpp != dst_bpp)
2992 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
2993 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
2994 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
2995 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
2996 byte_width = 2 * width;
3000 else if (src_bpp == 32)
3002 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3003 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3004 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3005 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3006 byte_width = 4 * width;
3018 uint8_t *s = src_bytes;
3019 uint8_t *d = dst_bytes;
3020 src_bytes += src_stride;
3021 dst_bytes += dst_stride;
3024 if (w >= 1 && ((unsigned long)d & 1))
3026 *(uint8_t *)d = *(uint8_t *)s;
3032 if (w >= 2 && ((unsigned long)d & 3))
3034 *(uint16_t *)d = *(uint16_t *)s;
3040 while (w >= 4 && ((unsigned long)d & 7))
3042 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3051 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
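      /* Copy 64 bytes per iteration: eight movq loads into mm0-mm7 followed by
       * eight movq stores.  The C fallback below does the same transfer with
       * ldq_u so misaligned source pointers still work. */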
3053 "movq (%1), %%mm0\n"
3054 "movq 8(%1), %%mm1\n"
3055 "movq 16(%1), %%mm2\n"
3056 "movq 24(%1), %%mm3\n"
3057 "movq 32(%1), %%mm4\n"
3058 "movq 40(%1), %%mm5\n"
3059 "movq 48(%1), %%mm6\n"
3060 "movq 56(%1), %%mm7\n"
3062 "movq %%mm0, (%0)\n"
3063 "movq %%mm1, 8(%0)\n"
3064 "movq %%mm2, 16(%0)\n"
3065 "movq %%mm3, 24(%0)\n"
3066 "movq %%mm4, 32(%0)\n"
3067 "movq %%mm5, 40(%0)\n"
3068 "movq %%mm6, 48(%0)\n"
3069 "movq %%mm7, 56(%0)\n"
3073 "%mm0", "%mm1", "%mm2", "%mm3",
3074 "%mm4", "%mm5", "%mm6", "%mm7");
3076 __m64 v0 = ldq_u ((__m64 *)(s + 0));
3077 __m64 v1 = ldq_u ((__m64 *)(s + 8));
3078 __m64 v2 = ldq_u ((__m64 *)(s + 16));
3079 __m64 v3 = ldq_u ((__m64 *)(s + 24));
3080 __m64 v4 = ldq_u ((__m64 *)(s + 32));
3081 __m64 v5 = ldq_u ((__m64 *)(s + 40));
3082 __m64 v6 = ldq_u ((__m64 *)(s + 48));
3083 __m64 v7 = ldq_u ((__m64 *)(s + 56));
3084 *(__m64 *)(d + 0) = v0;
3085 *(__m64 *)(d + 8) = v1;
3086 *(__m64 *)(d + 16) = v2;
3087 *(__m64 *)(d + 24) = v3;
3088 *(__m64 *)(d + 32) = v4;
3089 *(__m64 *)(d + 40) = v5;
3090 *(__m64 *)(d + 48) = v6;
3091 *(__m64 *)(d + 56) = v7;
3100 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3108 *(uint16_t *)d = *(uint16_t *)s;
3121 mmx_composite_copy_area (pixman_implementation_t *imp,
3122 pixman_composite_info_t *info)
3124 PIXMAN_COMPOSITE_ARGS (info);
3126 pixman_blt_mmx (src_image->bits.bits,
3127 dest_image->bits.bits,
3128 src_image->bits.rowstride,
3129 dest_image->bits.rowstride,
3130 PIXMAN_FORMAT_BPP (src_image->bits.format),
3131 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3132 src_x, src_y, dest_x, dest_y, width, height);
3136 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3137 pixman_composite_info_t *info)
3139 PIXMAN_COMPOSITE_ARGS (info);
3140 uint32_t *src, *src_line;
3141 uint32_t *dst, *dst_line;
3142 uint8_t *mask, *mask_line;
3143 int src_stride, mask_stride, dst_stride;
3146 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3147 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3148 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3153 src_line += src_stride;
3155 dst_line += dst_stride;
3157 mask_line += mask_stride;
3167 uint32_t ssrc = *src | 0xff000000;
3168 __m64 s = load8888 (&ssrc);
3176 __m64 sa = expand_alpha (s);
3177 __m64 vm = expand_alpha_rev (to_m64 (m));
3178 __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3180 store8888 (dst, vdest);
3193 static const pixman_fast_path_t mmx_fast_paths[] =
3195 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
3196 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
3197 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
3198 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
3199 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
3200 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
3201 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3202 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3203 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
3204 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3205 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3206 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
3207 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
3208 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
3209 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
3210 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
3211 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
3212 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
3213 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
3214 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
3215 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
3216 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
3217 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
3218 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
3219 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
3220 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),
3221 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
3222 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
3223 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
3224 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
3225 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
3226 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
3227 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
3228 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ),
3229 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3230 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3232 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
3233 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
3234 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
3235 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
3236 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
3237 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),
3239 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
3240 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
3241 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
3242 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),
3244 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
3245 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
3246 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
3247 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
3248 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
3249 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
3250 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3251 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3252 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3253 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3254 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
3255 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),
3257 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
3258 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
3263 static pixman_bool_t
3264 mmx_blt (pixman_implementation_t *imp,
3265 uint32_t * src_bits,
3266 uint32_t * dst_bits,
3278 if (!pixman_blt_mmx (
3279 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3280 src_x, src_y, dest_x, dest_y, width, height))
3283 return _pixman_implementation_blt (
3285 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3286 src_x, src_y, dest_x, dest_y, width, height);
3292 static pixman_bool_t
3293 mmx_fill (pixman_implementation_t *imp,
3303 if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
3305 return _pixman_implementation_fill (
3306 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
3312 pixman_implementation_t *
3313 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
3315 pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
3317 imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
3318 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
3319 imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
3320 imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
3321 imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
3322 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
3323 imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
3324 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
3325 imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
3326 imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
3327 imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
3329 imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
3330 imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
3331 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
3332 imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
3333 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
3334 imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
3335 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
3336 imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
3337 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
3338 imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
3339 imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
3342 imp->fill = mmx_fill;
3347 #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */