 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor

#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI

#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>

#include "pixman-private.h"
#include "pixman-combine32.h"

#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)

/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))

# if (defined(__SUNPRO_C) || defined(_MSC_VER))
# include <xmmintrin.h>

/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
    asm ("pmovmskb %1, %0\n\t"

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
    asm ("pmulhuw %1, %0\n\t"

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
    asm ("pshufw %2, %1, %0\n\t"
         : "y" (__A), "K" (__N)

# define _mm_shuffle_pi16(A, N) \
    ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
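
/* Each argument is a 2-bit lane selector packed into a single byte;
 * e.g. _MM_SHUFFLE (3, 3, 3, 3) == 0xff broadcasts 16-bit lane 3 to all
 * four lanes, which is how expand_alpha () below replicates alpha. */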

/* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * register; i.e. use
 *      _mm_mullo_pi16 (x, mmx_constant);
 * not
 *      _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */
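
/* For example (an illustrative sketch, not code from this file), chained
 * saturating adds such as
 *
 *      t = _mm_adds_pu8 (a, b);
 *      u = _mm_adds_pu8 (t, c);
 *      v = _mm_adds_pu8 (u, d);
 *
 * serialize on one another, while the reassociated form
 *
 *      t = _mm_adds_pu8 (a, b);
 *      u = _mm_adds_pu8 (c, d);
 *      v = _mm_adds_pu8 (t, u);
 *
 * lets the first two adds issue independently (unsigned saturating
 * addition is associative, so the result is unchanged).
 */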

/* --------------- MMX primitives ------------------------------------- */

/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
# define M64_MEMBER m64_u64
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined. If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
typedef __m64 mmxdatafield;

    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field = { val ## ULL }
#else /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field = val ## ULL

static const mmx_data_t c =
    MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
    MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
    MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000),

#ifdef USE_CVT_INTRINSICS
# define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
# define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
# define MC(x) (*(__m64 *)&c.mmx_ ## x)
# define MC(x) c.mmx_ ## x

static force_inline __m64
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
#elif defined USE_M64_DOUBLE
#else /* USE_M64_CASTS */

static force_inline uint64_t
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
#elif defined USE_M64_DOUBLE
    return *(uint64_t *)&x;
#else /* USE_M64_CASTS */

static force_inline __m64
        return _mm_slli_si64 (v, s);
        return _mm_srli_si64 (v, -s);

static force_inline __m64
    return _mm_xor_si64 (mask, MC (4x00ff));

static force_inline __m64
pix_multiply (__m64 a, __m64 b)
    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));
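
/* A scalar sketch (illustrative only, not code from this file) of what
 * pix_multiply computes in each 16-bit lane: the exact, rounded a*b/255
 * via the add-high-byte trick:
 *
 *      static inline uint8_t mul_un8 (uint8_t a, uint8_t b)
 *      {
 *          uint16_t t = (uint16_t)a * b + 0x80;
 *          return (uint8_t)((t + (t >> 8)) >> 8);
 *      }
 *
 * The _mm_mulhi_pu16 with 0x0101 above evaluates (t * 0x0101) >> 16,
 * which is the same (t + (t >> 8)) >> 8 in all four lanes at once.
 */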

static force_inline __m64
pix_add (__m64 a, __m64 b)
    return _mm_adds_pu8 (a, b);

static force_inline __m64
expand_alpha (__m64 pixel)
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));

static force_inline __m64
expand_alpha_rev (__m64 pixel)
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));

static force_inline __m64
invert_colors (__m64 pixel)
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
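
/* Porter-Duff OVER for premultiplied pixels, per 8-bit channel:
 * dest' = src + dest * (255 - srca) / 255. */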
static force_inline __m64
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));

static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);

static force_inline __m64
in (__m64 src, __m64 mask)
    return pix_multiply (src, mask);

static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
    return over (in (src, mask), pix_multiply (srca, mask), dest);

#define in_over(src, srca, mask, dest) \
    over (in (src, mask), pix_multiply (srca, mask), dest)

/* Elemental unaligned loads */

static force_inline __m64 ldq_u(__m64 *p)
    /* x86's alignment restrictions are very relaxed. */
#elif defined USE_ARM_IWMMXT
    int align = (uintptr_t)p & 7;

    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);

    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;

static force_inline uint32_t ldl_u(const uint32_t *p)
    /* x86's alignment restrictions are very relaxed. */
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;

static force_inline __m64
load (const uint32_t *v)
#ifdef USE_LOONGSON_MMI
    asm ("lwc1 %0, %1\n\t"

    return _mm_cvtsi32_si64 (*v);

static force_inline __m64
load8888 (const uint32_t *v)
#ifdef USE_LOONGSON_MMI
    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());

static force_inline __m64
load8888u (const uint32_t *v)
    uint32_t l = ldl_u (v);
    return load8888 (&l);

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
    return _mm_packs_pu16 (lo, hi);

static force_inline void
store (uint32_t *dest, __m64 v)
#ifdef USE_LOONGSON_MMI
    asm ("swc1 %1, %0\n\t"

    *dest = _mm_cvtsi64_si32 (v);

static force_inline void
store8888 (uint32_t *dest, __m64 v)
    v = pack8888 (v, _mm_setzero_si64 ());

static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
#ifdef USE_LOONGSON_MMI
    /* __m64 is double, we can compare directly. */
    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;

static force_inline pixman_bool_t
#ifdef USE_LOONGSON_MMI
    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
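    /* The movemask test above inspects bit 6, i.e. byte 6 of the unpacked
     * pixel: the low byte of the 16-bit alpha lane. It is set exactly when
     * alpha == 0xff. */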

static force_inline pixman_bool_t
    return is_equal (v, _mm_setzero_si64 ());

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
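 *
 * Concretely, the multiply-and-shift is plain bit replication: for a
 * 5-bit channel c, (c * 0x0840) >> 8 == (c << 3) | (c >> 2), and for the
 * 6-bit green channel, (c * 0x0410) >> 8 == (c << 2) | (c >> 4).
 */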
static force_inline __m64
expand565 (__m64 pixel, int pos)
    /* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
    p = loongson_extract_pi16 (p, pos);
    p = shift (shift (p, (3 - pos) * 16), -48);

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);

static force_inline __m64
expand8888 (__m64 in, int pos)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());

static force_inline __m64
expandx888 (__m64 in, int pos)
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));

static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

#ifdef USE_LOONGSON_MMI
    r = shift (r, -(32 - 8));
    g = shift (g, -(16 - 3));
    b = shift (b, -(0 + 3));

    p = _mm_or_si64 (r, g);
    p = _mm_or_si64 (p, b);
    return loongson_insert_pi16 (t, p, pos);

    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0 + 3) + pos * 16);

        t = _mm_and_si64 (t, MC (mask_0));
        t = _mm_and_si64 (t, MC (mask_1));
        t = _mm_and_si64 (t, MC (mask_2));
        t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);

#define pix_add_mul(x, a, y, b) \
    ( x = pix_multiply (x, a), \
      y = pix_multiply (y, b), \

/* --------------- MMX code patch for fbcompose.c --------------------- */
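
/* combine () implements the step shared by all the *_u combiners below:
 * load a source pixel and, when a mask is present, multiply it by the
 * mask's expanded alpha. */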
static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
    __m64 vsrc = load8888 (src);

        __m64 m = load8888 (mask);

        m = expand_alpha (m);
        vsrc = pix_multiply (vsrc, m);

mmx_combine_over_u (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,
    const uint32_t *end = dest + width;

        __m64 vsrc = combine (src, mask);
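
        /* Fast paths: a fully opaque source replaces the destination
         * outright, and a fully transparent source leaves it untouched. */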
        if (is_opaque (vsrc))
            store8888 (dest, vsrc);
        else if (!is_zero (vsrc))
            __m64 sa = expand_alpha (vsrc);
            store8888 (dest, over (vsrc, sa, load8888 (dest)));

mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            const uint32_t * src,
                            const uint32_t * mask,
    const uint32_t *end = dest + width;

        __m64 s = combine (src, mask);

        da = expand_alpha (d);
        store8888 (dest, over (d, da, s));

mmx_combine_in_u (pixman_implementation_t *imp,
                  const uint32_t * src,
                  const uint32_t * mask,
    const uint32_t *end = dest + width;

        __m64 x = combine (src, mask);

        a = expand_alpha (a);
        x = pix_multiply (x, a);

mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          const uint32_t * src,
                          const uint32_t * mask,
    const uint32_t *end = dest + width;

        __m64 a = combine (src, mask);

        a = expand_alpha (a);
        x = pix_multiply (x, a);

mmx_combine_out_u (pixman_implementation_t *imp,
                   const uint32_t * src,
                   const uint32_t * mask,
    const uint32_t *end = dest + width;

        __m64 x = combine (src, mask);

        a = expand_alpha (a);
        x = pix_multiply (x, a);

mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           const uint32_t * src,
                           const uint32_t * mask,
    const uint32_t *end = dest + width;

        __m64 a = combine (src, mask);

        a = expand_alpha (a);
        x = pix_multiply (x, a);

mmx_combine_atop_u (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,
    const uint32_t *end = dest + width;

        __m64 s = combine (src, mask);

        sia = expand_alpha (s);
        da = expand_alpha (d);
        s = pix_add_mul (s, da, d, sia);

mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            const uint32_t * src,
                            const uint32_t * mask,

        __m64 s = combine (src, mask);

        sa = expand_alpha (s);
        dia = expand_alpha (d);
        s = pix_add_mul (s, dia, d, sa);

mmx_combine_xor_u (pixman_implementation_t *imp,
                   const uint32_t * src,
                   const uint32_t * mask,
    const uint32_t *end = dest + width;

        __m64 s = combine (src, mask);

        sia = expand_alpha (s);
        dia = expand_alpha (d);

        s = pix_add_mul (s, dia, d, sia);

mmx_combine_add_u (pixman_implementation_t *imp,
                   const uint32_t * src,
                   const uint32_t * mask,
    const uint32_t *end = dest + width;

        __m64 s = combine (src, mask);

mmx_combine_saturate_u (pixman_implementation_t *imp,
                        const uint32_t * src,
                        const uint32_t * mask,
    const uint32_t *end = dest + width;

        __m64 ms = combine (src, mask);
        __m64 md = load8888 (dest);
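
        /* For SATURATE: when the source alpha exceeds the destination's
         * remaining headroom, the source is scaled by their ratio (the
         * DIV_UN8 below) so that the pix_add cannot overflow a channel. */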
            uint32_t quot = DIV_UN8 (da, sa) << 24;
            __m64 msa = load8888 (&quot);
            msa = expand_alpha (msa);
            ms = pix_multiply (ms, msa);

        md = pix_add (md, ms);
        store8888 (dest, md);

mmx_combine_src_ca (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);

        s = pix_multiply (s, a);

mmx_combine_over_ca (pixman_implementation_t *imp,
                     const uint32_t * src,
                     const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        store8888 (dest, in_over (s, sa, a, d));

mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             const uint32_t * src,
                             const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        store8888 (dest, over (d, da, in (s, a)));

mmx_combine_in_ca (pixman_implementation_t *imp,
                   const uint32_t * src,
                   const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           const uint32_t * src,
                           const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        d = pix_multiply (d, a);
        store8888 (dest, d);

mmx_combine_out_ca (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            const uint32_t * src,
                            const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);

        d = pix_multiply (d, a);
        store8888 (dest, d);

mmx_combine_atop_ca (pixman_implementation_t *imp,
                     const uint32_t * src,
                     const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);

        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             const uint32_t * src,
                             const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);

        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

mmx_combine_xor_ca (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);

        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

mmx_combine_add_ca (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);

        s = pix_multiply (s, a);

        store8888 (dest, d);

/* ------------- MMX code paths called from fbpict.c -------------------- */

mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

        dst_line += dst_stride;

        while (w && (unsigned long)dst & 7)
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

            vdest = *(__m64 *)dst;

            dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
            dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

            *(__m64 *)dst = pack8888 (dest0, dest1);

            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

        dst_line += dst_stride;

        while (w && (unsigned long)dst & 7)
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            vdest = *(__m64 *)dst;

            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);

            *(__m64 *)dst = vdest;

            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *mask_line;
    int dst_stride, mask_stride;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

        uint32_t *p = (uint32_t *)mask_line;
        uint32_t *q = (uint32_t *)dst_line;

        while (twidth && (unsigned long)q & 7)
            uint32_t m = *(uint32_t *)p;

                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);

            __m64 vdest = *(__m64 *)q;

            dest0 = in_over (vsrc, vsrca, load8888 (&m0),
                             expand8888 (vdest, 0));
            dest1 = in_over (vsrc, vsrca, load8888 (&m1),
                             expand8888 (vdest, 1));

            *(__m64 *)q = pack8888 (dest0, dest1);

            uint32_t m = *(uint32_t *)p;

                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);

        dst_line += dst_stride;
        mask_line += mask_stride;

mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
    vmask = load8888 (&mask);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

            __m64 vs = ldq_u ((__m64 *)src);
            __m64 vd = *(__m64 *)dst;
            __m64 vsrc0 = expand8888 (vs, 0);
            __m64 vsrc1 = expand8888 (vs, 1);

            *(__m64 *)dst = pack8888 (
                in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
                in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
    vmask = load8888 (&mask);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            __m64 vd0 = *(__m64 *)(dst + 0);
            __m64 vd1 = *(__m64 *)(dst + 2);
            __m64 vd2 = *(__m64 *)(dst + 4);
            __m64 vd3 = *(__m64 *)(dst + 6);
            __m64 vd4 = *(__m64 *)(dst + 8);
            __m64 vd5 = *(__m64 *)(dst + 10);
            __m64 vd6 = *(__m64 *)(dst + 12);
            __m64 vd7 = *(__m64 *)(dst + 14);

            __m64 vs0 = ldq_u ((__m64 *)(src + 0));
            __m64 vs1 = ldq_u ((__m64 *)(src + 2));
            __m64 vs2 = ldq_u ((__m64 *)(src + 4));
            __m64 vs3 = ldq_u ((__m64 *)(src + 6));
            __m64 vs4 = ldq_u ((__m64 *)(src + 8));
            __m64 vs5 = ldq_u ((__m64 *)(src + 10));
            __m64 vs6 = ldq_u ((__m64 *)(src + 12));
            __m64 vs7 = ldq_u ((__m64 *)(src + 14));

                in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
                in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

                in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
                in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

                in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
                in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

                in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
                in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

                in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
                in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

                in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
                in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

                in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
                in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

                in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
                in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

            *(__m64 *)(dst + 0) = vd0;
            *(__m64 *)(dst + 2) = vd1;
            *(__m64 *)(dst + 4) = vd2;
            *(__m64 *)(dst + 6) = vd3;
            *(__m64 *)(dst + 8) = vd4;
            *(__m64 *)(dst + 10) = vd5;
            *(__m64 *)(dst + 12) = vd6;
            *(__m64 *)(dst + 14) = vd7;

            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

            sa = expand_alpha (ms);
            store8888 (dst, over (ms, sa, load8888 (dst)));

mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    assert (src_image->drawable == mask_image->drawable);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            __m64 vsrc = load8888 (src);
            __m64 vdest = expand565 (to_m64 (d), 0);

                over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            __m64 vsrc0, vsrc1, vsrc2, vsrc3;

            vsrc0 = load8888 ((src + 0));
            vsrc1 = load8888 ((src + 1));
            vsrc2 = load8888 ((src + 2));
            vsrc3 = load8888 ((src + 3));

            vdest = *(__m64 *)dst;

            vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
            vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
            vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
            vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);

            *(__m64 *)dst = vdest;

            __m64 vsrc = load8888 (src);
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 7)
                __m64 vdest = in_over (vsrc, vsrca,
                                       expand_alpha_rev (to_m64 (m)),

                store8888 (dst, vdest);

            if (srca == 0xff && (m0 & m1) == 0xff)
                *(uint64_t *)dst = srcsrc;

                vdest = *(__m64 *)dst;

                dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
                                 expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (dest0, dest1);

                __m64 vdest = load8888 (dst);

                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
                store8888 (dst, vdest);

pixman_fill_mmx (uint32_t *bits,
    uint32_t byte_width;

#if defined __GNUC__ && defined USE_X86_MMX
    __m64 v1, v2, v3, v4, v5, v6, v7;

    if (bpp != 16 && bpp != 32 && bpp != 8)

        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);

        xor = (xor & 0xff) * 0x01010101;
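        /* Multiplying by 0x01010101 replicates the low byte into all four
         * bytes of the word (0x00010001 likewise for the 16bpp case below),
         * so the fill can then proceed a word (or an __m64) at a time. */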
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;

        xor = (xor & 0xffff) * 0x00010001;

        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;

    fill = ((uint64_t)xor << 32) | xor;
    vfill = to_m64 (fill);

#if defined __GNUC__ && defined USE_X86_MMX
        : "=&y" (v1), "=&y" (v2), "=&y" (v3),
          "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)

        uint8_t *d = byte_line;

        byte_line += stride;

        if (w >= 1 && ((unsigned long)d & 1))
            *(uint8_t *)d = (xor & 0xff);

        if (w >= 2 && ((unsigned long)d & 3))
            *(uint16_t *)d = xor;

        while (w >= 4 && ((unsigned long)d & 7))
            *(uint32_t *)d = xor;

#if defined __GNUC__ && defined USE_X86_MMX
              "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
              "y" (v4), "y" (v5), "y" (v6), "y" (v7)

            *(__m64*) (d + 0) = vfill;
            *(__m64*) (d + 8) = vfill;
            *(__m64*) (d + 16) = vfill;
            *(__m64*) (d + 24) = vfill;
            *(__m64*) (d + 32) = vfill;
            *(__m64*) (d + 40) = vfill;
            *(__m64*) (d + 48) = vfill;
            *(__m64*) (d + 56) = vfill;

            *(uint32_t *)d = xor;

            *(uint16_t *)d = xor;

            *(uint8_t *)d = (xor & 0xff);

mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

        pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride,
                         PIXMAN_FORMAT_BPP (dest_image->bits.format),
                         dest_x, dest_y, width, height, 0);

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 7)
                __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

                store8888 (dst, vdest);

            if (srca == 0xff && (m0 & m1) == 0xff)
                *(uint64_t *)dst = srcsrc;

                dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
                dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));

                *(__m64 *)dst = pack8888 (dest0, dest1);

                *(uint64_t *)dst = 0;

                __m64 vdest = load8888 (dst);

                vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
                store8888 (dst, vdest);

mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca, tmp;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    srcsrcsrcsrc = expand_alpha_rev (tmp);

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 7)
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);

            uint64_t m0, m1, m2, m3;

            if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
                *(__m64 *)dst = srcsrcsrcsrc;
            else if (m0 | m1 | m2 | m3)
                __m64 vm0, vm1, vm2, vm3;

                vdest = *(__m64 *)dst;

                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
                                           expand565 (vdest, 0)), vdest, 0);

                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
                                           expand565 (vdest, 1)), vdest, 1);

                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
                                           expand565 (vdest, 2)), vdest, 2);

                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
                                           expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)dst = vdest;

                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),

                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);

mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    assert (src_image->drawable == mask_image->drawable);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            __m64 vsrc = load8888 (src);
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            uint32_t s0, s1, s2, s3;
            unsigned char a0, a1, a2, a3;

            if ((a0 & a1 & a2 & a3) == 0xFF)
                vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0);
                vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1);
                vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2);
                vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3);

                *(__m64 *)dst = vdest;
            else if (s0 | s1 | s2 | s3)
                __m64 vdest = *(__m64 *)dst;

                vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0);
                vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1);
                vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2);
                vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)dst = vdest;

            __m64 vsrc = load8888 (src);
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    assert (src_image->drawable == mask_image->drawable);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, over_rev_non_pre (s, d));

            unsigned char a0, a1;

            if ((a0 & a1) == 0xFF)
                d0 = invert_colors (load8888 (&s0));
                d1 = invert_colors (load8888 (&s1));

                *(__m64 *)dst = pack8888 (d0, d1);

                __m64 vdest = *(__m64 *)dst;

                d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
                d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (d0, d1);

            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, over_rev_non_pre (s, d));

mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *mask_line;
    int dst_stride, mask_stride;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

        uint32_t *p = (uint32_t *)mask_line;
        uint16_t *q = (uint16_t *)dst_line;

        while (twidth && ((unsigned long)q & 7))
            uint32_t m = *(uint32_t *)p;

                __m64 vdest = expand565 (to_m64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
                *q = to_uint64 (vdest);

            uint32_t m0, m1, m2, m3;

            if ((m0 | m1 | m2 | m3))
                __m64 vdest = *(__m64 *)q;

                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)q = vdest;

                __m64 vdest = expand565 (to_m64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
                *q = to_uint64 (vdest);

        mask_line += mask_stride;
        dst_line += dst_stride;

mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 7)
            m = MUL_UN8 (sa, a, tmp);
            d = MUL_UN8 (m, d, tmp);

            vmask = load8888u ((uint32_t *)mask);
            vdest = load8888 ((uint32_t *)dst);

            store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));

            m = MUL_UN8 (sa, a, tmp);
            d = MUL_UN8 (m, d, tmp);

mmx_composite_in_8_8 (pixman_implementation_t *imp,
                      pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int src_stride, dst_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 3)
            *dst = MUL_UN8 (s, d, tmp);

            uint32_t *s = (uint32_t *)src;
            uint32_t *d = (uint32_t *)dst;

            store8888 (d, in (load8888u (s), load8888 (d)));

            *dst = MUL_UN8 (s, d, tmp);

mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 3)
            m = MUL_UN8 (sa, a, tmp);
            r = ADD_UN8 (m, d, tmp);

            vmask = load8888u ((uint32_t *)mask);
            vdest = load8888 ((uint32_t *)dst);

            store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));

            m = MUL_UN8 (sa, a, tmp);
            r = ADD_UN8 (m, d, tmp);

mmx_composite_add_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            s = t | (0 - (t >> 8));
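            /* t >> 8 is 1 exactly when the byte sum overflows, so
             * 0 - (t >> 8) is all ones and the OR above clamps s to 0xff. */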
            *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);

            s = t | (0 - (t >> 8));

mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
                                      load ((const uint32_t *)dst)));

            *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);

            store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
                                      load ((const uint32_t *)dst)));

static pixman_bool_t
pixman_blt_mmx (uint32_t *src_bits,
    uint8_t * src_bytes;
    uint8_t * dst_bytes;

    if (src_bpp != dst_bpp)

        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 2 * width;

    else if (src_bpp == 32)
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 4 * width;

        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;

        if (w >= 1 && ((unsigned long)d & 1))
            *(uint8_t *)d = *(uint8_t *)s;

        if (w >= 2 && ((unsigned long)d & 3))
            *(uint16_t *)d = *(uint16_t *)s;

        while (w >= 4 && ((unsigned long)d & 7))
            *(uint32_t *)d = ldl_u ((uint32_t *)s);

#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
                "movq (%1), %%mm0\n"
                "movq 8(%1), %%mm1\n"
                "movq 16(%1), %%mm2\n"
                "movq 24(%1), %%mm3\n"
                "movq 32(%1), %%mm4\n"
                "movq 40(%1), %%mm5\n"
                "movq 48(%1), %%mm6\n"
                "movq 56(%1), %%mm7\n"

                "movq %%mm0, (%0)\n"
                "movq %%mm1, 8(%0)\n"
                "movq %%mm2, 16(%0)\n"
                "movq %%mm3, 24(%0)\n"
                "movq %%mm4, 32(%0)\n"
                "movq %%mm5, 40(%0)\n"
                "movq %%mm6, 48(%0)\n"
                "movq %%mm7, 56(%0)\n"

                "%mm0", "%mm1", "%mm2", "%mm3",
                "%mm4", "%mm5", "%mm6", "%mm7");

            __m64 v0 = ldq_u ((__m64 *)(s + 0));
            __m64 v1 = ldq_u ((__m64 *)(s + 8));
            __m64 v2 = ldq_u ((__m64 *)(s + 16));
            __m64 v3 = ldq_u ((__m64 *)(s + 24));
            __m64 v4 = ldq_u ((__m64 *)(s + 32));
            __m64 v5 = ldq_u ((__m64 *)(s + 40));
            __m64 v6 = ldq_u ((__m64 *)(s + 48));
            __m64 v7 = ldq_u ((__m64 *)(s + 56));
            *(__m64 *)(d + 0) = v0;
            *(__m64 *)(d + 8) = v1;
            *(__m64 *)(d + 16) = v2;
            *(__m64 *)(d + 24) = v3;
            *(__m64 *)(d + 32) = v4;
            *(__m64 *)(d + 40) = v5;
            *(__m64 *)(d + 48) = v6;
            *(__m64 *)(d + 56) = v7;

            *(uint32_t *)d = ldl_u ((uint32_t *)s);

            *(uint16_t *)d = *(uint16_t *)s;

mmx_composite_copy_area (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);

    pixman_blt_mmx (src_image->bits.bits,
                    dest_image->bits.bits,
                    src_image->bits.rowstride,
                    dest_image->bits.rowstride,
                    PIXMAN_FORMAT_BPP (src_image->bits.format),
                    PIXMAN_FORMAT_BPP (dest_image->bits.format),
                    src_x, src_y, dest_x, dest_y, width, height);

mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *src, *src_line;
    uint32_t *dst, *dst_line;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        src_line += src_stride;
        dst_line += dst_stride;
        mask_line += mask_stride;

            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);

                __m64 sa = expand_alpha (s);
                __m64 vm = expand_alpha_rev (to_m64 (m));
                __m64 vdest = in_over (s, sa, vm, load8888 (dst));

                store8888 (dst, vdest);

mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((unsigned long)dst) & 7)
        *dst++ = (*src++) | 0xff000000;

        __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
        __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
        __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
        __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));

        *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
        *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
        *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
        *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));

        *dst++ = (*src++) | 0xff000000;

    return iter->buffer;

mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((unsigned long)dst) & 0x0f)
        uint16_t s = *src++;

        *dst++ = CONVERT_0565_TO_8888 (s);

        __m64 vsrc = ldq_u ((__m64 *)src);

        __m64 mm0 = expand565 (vsrc, 0);
        __m64 mm1 = expand565 (vsrc, 1);
        __m64 mm2 = expand565 (vsrc, 2);
        __m64 mm3 = expand565 (vsrc, 3);

        *(__m64 *)(dst + 0) = _mm_or_si64 (pack8888 (mm0, mm1), MC (ff000000));
        *(__m64 *)(dst + 2) = _mm_or_si64 (pack8888 (mm2, mm3), MC (ff000000));

        uint16_t s = *src++;

        *dst++ = CONVERT_0565_TO_8888 (s);

    return iter->buffer;

mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;

    iter->bits += iter->stride;

    while (w && (((unsigned long)dst) & 15))
        *dst++ = *(src++) << 24;

        __m64 mm0 = ldq_u ((__m64 *)src);

        __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0);
        __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0);
        __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
        __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
        __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
        __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);

        *(__m64 *)(dst + 0) = mm3;
        *(__m64 *)(dst + 2) = mm4;
        *(__m64 *)(dst + 4) = mm5;
        *(__m64 *)(dst + 6) = mm6;

        *dst++ = *(src++) << 24;

    return iter->buffer;

    pixman_format_code_t format;
    pixman_iter_get_scanline_t get_scanline;

static const fetcher_info_t fetchers[] =
    { PIXMAN_x8r8g8b8, mmx_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5, mmx_fetch_r5g6b5 },
    { PIXMAN_a8, mmx_fetch_a8 },

mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
    pixman_image_t *image = iter->image;
    int width = iter->width;
    int height = iter->height;

    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)

    if ((iter->flags & ITER_NARROW) &&
        (image->common.flags & FLAGS) == FLAGS &&
        x + width <= image->bits.width &&
        y + height <= image->bits.height)
        const fetcher_info_t *f;

        for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
            if (image->common.extended_format_code == f->format)
                uint8_t *b = (uint8_t *)image->bits.bits;
                int s = image->bits.rowstride * 4;

                iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;

                iter->get_scanline = f->get_scanline;

    imp->delegate->src_iter_init (imp->delegate, iter);

static const pixman_fast_path_t mmx_fast_paths[] =
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),

    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),

    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),

    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),

    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),

static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t * src_bits,
         uint32_t * dst_bits,
    if (!pixman_blt_mmx (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dest_x, dest_y, width, height))
        return _pixman_implementation_blt (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dest_x, dest_y, width, height);

static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
    if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);

pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->fill = mmx_fill;

    imp->src_iter_init = mmx_src_iter_init;

#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */