/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT
#include <mmintrin.h>

#include "pixman-private.h"
40 #include "pixman-combine32.h"
#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif
#ifdef USE_ARM_IWMMXT
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
}
#endif
60 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
61 * instructions to be generated that we don't want. Just duplicate the
62 * functions we want to use. */
63 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
64 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
66 asm("pmulhuw %1, %0\n\t"
73 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
74 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
78 asm("pshufw %2, %1, %0\n\t"
80 : "y" (__A), "K" (__N)
87 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
88 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
/* Notes about writing mmx code
 *
 * Give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register and then use that
 * register; for example,
 *
 *   _mm_mullo_pi16 (x, mmx_constant);
 *
 * is preferred over
 *
 *   _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies: when you need a value, try to
 * calculate it from a value that was calculated as early as possible.
 */
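/* A minimal sketch of the dependency advice above (illustrative
 * variables only, not code used elsewhere in this file):
 *
 *   t0 = pix_multiply (s0, a0);     independent of t1
 *   t1 = pix_multiply (s1, a1);     can issue in parallel with t0
 *   r  = pix_add (t0, t1);          the only dependent step
 */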
109 /* --------------- MMX primitives ------------------------------------- */
/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 */
#ifdef _MSC_VER
# define M64_MEMBER m64_u64
#elif defined(__ICC)
# define USE_CVT_INTRINSICS
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined. If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif
typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
} mmx_data_t;
#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field = { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field = val ## ULL
#endif
static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                  0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                  0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                 0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,   0x0000008404100840),
    MMXDATA_INIT (.mmx_565_r,                   0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                   0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                   0x00000000000000f8),
    MMXDATA_INIT (.mmx_mask_0,                  0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                  0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                  0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                  0x0000ffffffffffff),
    MMXDATA_INIT (.mmx_full_alpha,              0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                  0x0101010101010101),
};
#ifdef USE_CVT_INTRINSICS
# define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
# define MC(x) ((__m64)c.mmx_ ## x)
#else
# define MC(x) c.mmx_ ## x
#endif
static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}
static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#else /* USE_M64_CASTS */
    return (uint64_t)x;
#endif
}
static force_inline __m64
shift (__m64 v, int s)
{
    if (s > 0)
	return _mm_slli_si64 (v, s);
    else if (s < 0)
	return _mm_srli_si64 (v, -s);
    else
	return v;
}
static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}
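/* Per-channel multiply below uses the well-known divide-by-255 trick:
 * after the 0x80 bias, multiplying by 0x0101 and keeping the high 16
 * bits computes (t + 0x80 + ((t + 0x80) >> 8)) >> 8, i.e. a correctly
 * rounded t / 255 for 8-bit inputs. */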
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));

    return res;
}
static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}
static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}
static force_inline __m64
over (__m64 src, __m64 srca, __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}
static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}
static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}
static force_inline __m64
in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)
{
    src = _mm_or_si64 (src, MC (full_alpha));

    return over (in (src, mask), mask, dest);
}
#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}
#else
#define in_over(src, srca, mask, dest) \
    over (in (src, mask), pix_multiply (srca, mask), dest)
#endif
322 /* Elemental unaligned loads */
static __inline__ __m64 ldq_u(uint64_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *(__m64 *)p;
329 #elif defined USE_ARM_IWMMXT
    int align = (uintptr_t)p & 7;
    __m64 *aligned_p;

    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
#else
    struct __una_u64 { uint64_t x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;
#endif
}
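/* In the generic fallback above (and in ldl_u below), the packed
 * struct member may legally sit at any alignment, so the compiler
 * emits an unaligned-safe load rather than assuming natural
 * alignment. */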
static __inline__ uint32_t ldl_u(uint32_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *p;
#else
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    return ptr->x;
#endif
}
static force_inline __m64
load8888 (uint32_t v)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ());
}
static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}
static force_inline uint32_t
store8888 (__m64 v)
{
    return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of an mmx register into
 * 00RR 00GG 00BB, one 8-bit channel per 16-bit lane.
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word.
 */
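/* Worked example (hypothetical pixel): the 565 value 0x001f (pure
 * blue) masks to b = 0x1f; multiplying by the low word 0x0840 of the
 * unpack multiplier gives 0xffc0, and the final shift by 8 yields
 * 0xff, a fully saturated 8-bit blue channel. */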
387 static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bits and zero the rest */
    p = shift (shift (p, (3 - pos) * 16), -48);
396 t1 = shift (p, 36 - 11);
397 t2 = shift (p, 16 - 5);
399 p = _mm_or_si64 (t1, p);
400 p = _mm_or_si64 (t2, p);
401 p = _mm_and_si64 (p, MC (565_rgb));
403 pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
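/* Expand the 8888 pixel in the low (pos == 0) or high (pos == 1) half
 * of @in into four 16-bit channels. */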
407 static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}
static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}
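/* Pack an expanded pixel into r5g6b5 format and merge it into 16-bit
 * slot @pos (0-3) of @target, leaving the other three slots intact. */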
422 static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
430 g = _mm_and_si64 (p, MC (565_g));
431 b = _mm_and_si64 (p, MC (565_b));
433 r = shift (r, -(32 - 8) + pos * 16);
434 g = shift (g, -(16 - 3) + pos * 16);
435 b = shift (b, -(0 + 3) + pos * 16);
    if (pos == 0)
	t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
	t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
	t = _mm_and_si64 (t, MC (mask_2));
    else
	t = _mm_and_si64 (t, MC (mask_3));
446 p = _mm_or_si64 (r, t);
447 p = _mm_or_si64 (g, p);
    return _mm_or_si64 (b, p);
}
#ifndef _MSC_VER
static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}
#else
#define pix_add_mul(x, a, y, b) \
    ( x = pix_multiply (x, a), \
      y = pix_multiply (y, b), \
      pix_add (x, y) )
#endif
/* --------------- MMX code paths for fbcompose.c --------------------- */
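/* Multiply the source pixel by the alpha channel of the mask pixel,
 * when a mask is present, and return the result as a packed 8888
 * value; with no mask the source is returned unchanged. */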
474 static force_inline uint32_t
combine (const uint32_t *src, const uint32_t *mask)
{
    uint32_t ssrc = *src;

    if (mask)
    {
	__m64 m = load8888 (*mask);
	__m64 s = load8888 (ssrc);

	m = expand_alpha (m);
	s = pix_multiply (s, m);

	ssrc = store8888 (s);
    }

    return ssrc;
}
494 mmx_combine_over_u (pixman_implementation_t *imp,
497 const uint32_t * src,
498 const uint32_t * mask,
501 const uint32_t *end = dest + width;
	uint32_t ssrc = combine (src, mask);
	uint32_t a = ssrc >> 24;

	if (a == 0xff)
	{
	    *dest = ssrc;
	}
	else if (ssrc)
	{
	    __m64 s, sa;

	    s = load8888 (ssrc);
	    sa = expand_alpha (s);
	    *dest = store8888 (over (s, sa, load8888 (*dest)));
	}
529 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
532 const uint32_t * src,
533 const uint32_t * mask,
536 const uint32_t *end = dest + width;
541 uint32_t s = combine (src, mask);
543 d = load8888 (*dest);
544 da = expand_alpha (d);
545 *dest = store8888 (over (d, da, load8888 (s)));
556 mmx_combine_in_u (pixman_implementation_t *imp,
559 const uint32_t * src,
560 const uint32_t * mask,
563 const uint32_t *end = dest + width;
569 x = load8888 (combine (src, mask));
570 a = load8888 (*dest);
571 a = expand_alpha (a);
572 x = pix_multiply (x, a);
574 *dest = store8888 (x);
585 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
588 const uint32_t * src,
589 const uint32_t * mask,
592 const uint32_t *end = dest + width;
598 x = load8888 (*dest);
599 a = load8888 (combine (src, mask));
600 a = expand_alpha (a);
601 x = pix_multiply (x, a);
602 *dest = store8888 (x);
613 mmx_combine_out_u (pixman_implementation_t *imp,
616 const uint32_t * src,
617 const uint32_t * mask,
620 const uint32_t *end = dest + width;
626 x = load8888 (combine (src, mask));
627 a = load8888 (*dest);
	a = expand_alpha (a);
	a = negate (a);
	x = pix_multiply (x, a);
631 *dest = store8888 (x);
642 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
645 const uint32_t * src,
646 const uint32_t * mask,
649 const uint32_t *end = dest + width;
655 x = load8888 (*dest);
656 a = load8888 (combine (src, mask));
	a = expand_alpha (a);
	a = negate (a);
	x = pix_multiply (x, a);
661 *dest = store8888 (x);
672 mmx_combine_atop_u (pixman_implementation_t *imp,
675 const uint32_t * src,
676 const uint32_t * mask,
679 const uint32_t *end = dest + width;
685 s = load8888 (combine (src, mask));
686 d = load8888 (*dest);
	sia = expand_alpha (s);
	sia = negate (sia);
689 da = expand_alpha (d);
690 s = pix_add_mul (s, da, d, sia);
691 *dest = store8888 (s);
702 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
705 const uint32_t * src,
706 const uint32_t * mask,
717 s = load8888 (combine (src, mask));
718 d = load8888 (*dest);
719 sa = expand_alpha (s);
	dia = expand_alpha (d);
	dia = negate (dia);
722 s = pix_add_mul (s, dia, d, sa);
723 *dest = store8888 (s);
734 mmx_combine_xor_u (pixman_implementation_t *imp,
737 const uint32_t * src,
738 const uint32_t * mask,
741 const uint32_t *end = dest + width;
745 __m64 s, dia, d, sia;
747 s = load8888 (combine (src, mask));
748 d = load8888 (*dest);
749 sia = expand_alpha (s);
	dia = expand_alpha (d);
	sia = negate (sia);
	dia = negate (dia);
753 s = pix_add_mul (s, dia, d, sia);
754 *dest = store8888 (s);
765 mmx_combine_add_u (pixman_implementation_t *imp,
768 const uint32_t * src,
769 const uint32_t * mask,
772 const uint32_t *end = dest + width;
778 s = load8888 (combine (src, mask));
	d = load8888 (*dest);
	s = pix_add (s, d);
	*dest = store8888 (s);
792 mmx_combine_saturate_u (pixman_implementation_t *imp,
795 const uint32_t * src,
796 const uint32_t * mask,
799 const uint32_t *end = dest + width;
803 uint32_t s = combine (src, mask);
805 __m64 ms = load8888 (s);
806 __m64 md = load8888 (d);
807 uint32_t sa = s >> 24;
808 uint32_t da = ~d >> 24;
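	/* The source is scaled down by da/sa only when its alpha
	 * exceeds the headroom left in the destination (~d >> 24). */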
	if (sa > da)
	{
	    __m64 msa = load8888 (DIV_UN8 (da, sa) << 24);
	    msa = expand_alpha (msa);
	    ms = pix_multiply (ms, msa);
	}
817 md = pix_add (md, ms);
818 *dest = store8888 (md);
829 mmx_combine_src_ca (pixman_implementation_t *imp,
832 const uint32_t * src,
833 const uint32_t * mask,
836 const uint32_t *end = src + width;
840 __m64 a = load8888 (*mask);
841 __m64 s = load8888 (*src);
843 s = pix_multiply (s, a);
844 *dest = store8888 (s);
854 mmx_combine_over_ca (pixman_implementation_t *imp,
857 const uint32_t * src,
858 const uint32_t * mask,
861 const uint32_t *end = src + width;
865 __m64 a = load8888 (*mask);
866 __m64 s = load8888 (*src);
867 __m64 d = load8888 (*dest);
868 __m64 sa = expand_alpha (s);
870 *dest = store8888 (in_over (s, sa, a, d));
880 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
883 const uint32_t * src,
884 const uint32_t * mask,
887 const uint32_t *end = src + width;
891 __m64 a = load8888 (*mask);
892 __m64 s = load8888 (*src);
893 __m64 d = load8888 (*dest);
894 __m64 da = expand_alpha (d);
896 *dest = store8888 (over (d, da, in (s, a)));
906 mmx_combine_in_ca (pixman_implementation_t *imp,
909 const uint32_t * src,
910 const uint32_t * mask,
913 const uint32_t *end = src + width;
917 __m64 a = load8888 (*mask);
918 __m64 s = load8888 (*src);
919 __m64 d = load8888 (*dest);
920 __m64 da = expand_alpha (d);
922 s = pix_multiply (s, a);
923 s = pix_multiply (s, da);
924 *dest = store8888 (s);
934 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
937 const uint32_t * src,
938 const uint32_t * mask,
941 const uint32_t *end = src + width;
945 __m64 a = load8888 (*mask);
946 __m64 s = load8888 (*src);
947 __m64 d = load8888 (*dest);
948 __m64 sa = expand_alpha (s);
950 a = pix_multiply (a, sa);
951 d = pix_multiply (d, a);
952 *dest = store8888 (d);
962 mmx_combine_out_ca (pixman_implementation_t *imp,
965 const uint32_t * src,
966 const uint32_t * mask,
969 const uint32_t *end = src + width;
973 __m64 a = load8888 (*mask);
974 __m64 s = load8888 (*src);
975 __m64 d = load8888 (*dest);
976 __m64 da = expand_alpha (d);
	da = negate (da);
	s = pix_multiply (s, a);
980 s = pix_multiply (s, da);
981 *dest = store8888 (s);
991 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
994 const uint32_t * src,
995 const uint32_t * mask,
998 const uint32_t *end = src + width;
1002 __m64 a = load8888 (*mask);
1003 __m64 s = load8888 (*src);
1004 __m64 d = load8888 (*dest);
1005 __m64 sa = expand_alpha (s);
1007 a = pix_multiply (a, sa);
	a = negate (a);
	d = pix_multiply (d, a);
1010 *dest = store8888 (d);
1020 mmx_combine_atop_ca (pixman_implementation_t *imp,
1023 const uint32_t * src,
1024 const uint32_t * mask,
1027 const uint32_t *end = src + width;
1031 __m64 a = load8888 (*mask);
1032 __m64 s = load8888 (*src);
1033 __m64 d = load8888 (*dest);
1034 __m64 da = expand_alpha (d);
1035 __m64 sa = expand_alpha (s);
1037 s = pix_multiply (s, a);
1038 a = pix_multiply (a, sa);
	a = negate (a);
	d = pix_add_mul (d, a, s, da);
1041 *dest = store8888 (d);
1051 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1054 const uint32_t * src,
1055 const uint32_t * mask,
1058 const uint32_t *end = src + width;
1062 __m64 a = load8888 (*mask);
1063 __m64 s = load8888 (*src);
1064 __m64 d = load8888 (*dest);
1065 __m64 da = expand_alpha (d);
1066 __m64 sa = expand_alpha (s);
1068 s = pix_multiply (s, a);
1069 a = pix_multiply (a, sa);
	da = negate (da);
	d = pix_add_mul (d, a, s, da);
1072 *dest = store8888 (d);
1082 mmx_combine_xor_ca (pixman_implementation_t *imp,
1085 const uint32_t * src,
1086 const uint32_t * mask,
1089 const uint32_t *end = src + width;
1093 __m64 a = load8888 (*mask);
1094 __m64 s = load8888 (*src);
1095 __m64 d = load8888 (*dest);
1096 __m64 da = expand_alpha (d);
1097 __m64 sa = expand_alpha (s);
1099 s = pix_multiply (s, a);
1100 a = pix_multiply (a, sa);
	da = negate (da);
	a = negate (a);
	d = pix_add_mul (d, a, s, da);
1104 *dest = store8888 (d);
1114 mmx_combine_add_ca (pixman_implementation_t *imp,
1117 const uint32_t * src,
1118 const uint32_t * mask,
1121 const uint32_t *end = src + width;
1125 __m64 a = load8888 (*mask);
1126 __m64 s = load8888 (*src);
1127 __m64 d = load8888 (*dest);
1129 s = pix_multiply (s, a);
	d = pix_add (s, d);
	*dest = store8888 (d);
1140 /* ------------- MMX code paths called from fbpict.c -------------------- */
1143 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1144 pixman_composite_info_t *info)
1146 PIXMAN_COMPOSITE_ARGS (info);
1148 uint32_t *dst_line, *dst;
1155 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1160 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1162 vsrc = load8888 (src);
1163 vsrca = expand_alpha (vsrc);
1168 dst_line += dst_stride;
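	/* Peel off scalar iterations until dst is 8-byte aligned so the
	 * vector loop below can use aligned 64-bit stores. */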
1173 while (w && (unsigned long)dst & 7)
1175 *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
1186 vdest = *(__m64 *)dst;
1188 dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1189 dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1191 *(__m64 *)dst = pack8888 (dest0, dest1);
1201 *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
1209 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1210 pixman_composite_info_t *info)
1212 PIXMAN_COMPOSITE_ARGS (info);
1214 uint16_t *dst_line, *dst;
1221 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1226 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1228 vsrc = load8888 (src);
1229 vsrca = expand_alpha (vsrc);
1234 dst_line += dst_stride;
1239 while (w && (unsigned long)dst & 7)
1242 __m64 vdest = expand565 (to_m64 (d), 0);
1244 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1245 *dst = to_uint64 (vdest);
1255 vdest = *(__m64 *)dst;
1257 vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
1258 vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
1259 vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
1260 vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);
1262 *(__m64 *)dst = vdest;
1273 __m64 vdest = expand565 (to_m64 (d), 0);
1275 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1276 *dst = to_uint64 (vdest);
1287 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1288 pixman_composite_info_t *info)
1290 PIXMAN_COMPOSITE_ARGS (info);
1293 uint32_t *mask_line;
1294 int dst_stride, mask_stride;
1299 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1304 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1305 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1307 vsrc = load8888 (src);
1308 vsrca = expand_alpha (vsrc);
1313 uint32_t *p = (uint32_t *)mask_line;
1314 uint32_t *q = (uint32_t *)dst_line;
1316 while (twidth && (unsigned long)q & 7)
1318 uint32_t m = *(uint32_t *)p;
1322 __m64 vdest = load8888 (*q);
1323 vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
1324 *q = store8888 (vdest);
1341 __m64 vdest = *(__m64 *)q;
1343 dest0 = in_over (vsrc, vsrca, load8888 (m0),
1344 expand8888 (vdest, 0));
1345 dest1 = in_over (vsrc, vsrca, load8888 (m1),
1346 expand8888 (vdest, 1));
1348 *(__m64 *)q = pack8888 (dest0, dest1);
1358 uint32_t m = *(uint32_t *)p;
1362 __m64 vdest = load8888 (*q);
1363 vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
1364 *q = store8888 (vdest);
1372 dst_line += dst_stride;
1373 mask_line += mask_stride;
1380 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1381 pixman_composite_info_t *info)
1383 PIXMAN_COMPOSITE_ARGS (info);
1384 uint32_t *dst_line, *dst;
1385 uint32_t *src_line, *src;
1388 int dst_stride, src_stride;
1393 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1394 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1396 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    mask &= 0xff000000;
    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1399 vmask = load8888 (mask);
1404 dst_line += dst_stride;
1406 src_line += src_stride;
1409 while (w && (unsigned long)dst & 7)
1411 __m64 s = load8888 (*src);
1412 __m64 d = load8888 (*dst);
1414 *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
1423 __m64 vs = ldq_u((uint64_t *)src);
1424 __m64 vd = *(__m64 *)dst;
1425 __m64 vsrc0 = expand8888 (vs, 0);
1426 __m64 vsrc1 = expand8888 (vs, 1);
1428 *(__m64 *)dst = pack8888 (
1429 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1430 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1439 __m64 s = load8888 (*src);
1440 __m64 d = load8888 (*dst);
1442 *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
1450 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1451 pixman_composite_info_t *info)
1453 PIXMAN_COMPOSITE_ARGS (info);
1454 uint32_t *dst_line, *dst;
1455 uint32_t *src_line, *src;
1458 int dst_stride, src_stride;
1464 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1465 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1466 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    mask &= 0xff000000;
    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1470 vmask = load8888 (mask);
1476 dst_line += dst_stride;
1478 src_line += src_stride;
1481 while (w && (unsigned long)dst & 7)
1483 __m64 s = load8888 (*src | 0xff000000);
1484 __m64 d = load8888 (*dst);
1486 *dst = store8888 (in_over (s, srca, vmask, d));
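	    /* Unrolled body: eight 64-bit loads and stores per
	     * iteration, i.e. 16 pixels at a time. */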
1495 __m64 vd0 = *(__m64 *)(dst + 0);
1496 __m64 vd1 = *(__m64 *)(dst + 2);
1497 __m64 vd2 = *(__m64 *)(dst + 4);
1498 __m64 vd3 = *(__m64 *)(dst + 6);
1499 __m64 vd4 = *(__m64 *)(dst + 8);
1500 __m64 vd5 = *(__m64 *)(dst + 10);
1501 __m64 vd6 = *(__m64 *)(dst + 12);
1502 __m64 vd7 = *(__m64 *)(dst + 14);
1504 __m64 vs0 = ldq_u((uint64_t *)(src + 0));
1505 __m64 vs1 = ldq_u((uint64_t *)(src + 2));
1506 __m64 vs2 = ldq_u((uint64_t *)(src + 4));
1507 __m64 vs3 = ldq_u((uint64_t *)(src + 6));
1508 __m64 vs4 = ldq_u((uint64_t *)(src + 8));
1509 __m64 vs5 = ldq_u((uint64_t *)(src + 10));
1510 __m64 vs6 = ldq_u((uint64_t *)(src + 12));
1511 __m64 vs7 = ldq_u((uint64_t *)(src + 14));
1514 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1515 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1518 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1519 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1522 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1523 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1526 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1527 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1530 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1531 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1534 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1535 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1538 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1539 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1542 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1543 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1545 *(__m64 *)(dst + 0) = vd0;
1546 *(__m64 *)(dst + 2) = vd1;
1547 *(__m64 *)(dst + 4) = vd2;
1548 *(__m64 *)(dst + 6) = vd3;
1549 *(__m64 *)(dst + 8) = vd4;
1550 *(__m64 *)(dst + 10) = vd5;
1551 *(__m64 *)(dst + 12) = vd6;
1552 *(__m64 *)(dst + 14) = vd7;
1561 __m64 s = load8888 (*src | 0xff000000);
1562 __m64 d = load8888 (*dst);
1564 *dst = store8888 (in_over (s, srca, vmask, d));
1576 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1577 pixman_composite_info_t *info)
1579 PIXMAN_COMPOSITE_ARGS (info);
1580 uint32_t *dst_line, *dst;
1581 uint32_t *src_line, *src;
1583 int dst_stride, src_stride;
1589 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1590 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1595 dst_line += dst_stride;
1597 src_line += src_stride;
1613 sa = expand_alpha (ms);
1614 *dst = store8888 (over (ms, sa, load8888 (*dst)));
1624 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1625 pixman_composite_info_t *info)
1627 PIXMAN_COMPOSITE_ARGS (info);
1628 uint16_t *dst_line, *dst;
1629 uint32_t *src_line, *src;
1630 int dst_stride, src_stride;
1635 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1636 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1640 assert (src_image->drawable == mask_image->drawable);
1646 dst_line += dst_stride;
1648 src_line += src_stride;
1653 while (w && (unsigned long)dst & 7)
1655 __m64 vsrc = load8888 (*src);
1657 __m64 vdest = expand565 (to_m64 (d), 0);
1660 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1662 *dst = to_uint64 (vdest);
1673 __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1676 vsrc0 = load8888 (*(src + 0));
1677 vsrc1 = load8888 (*(src + 1));
1678 vsrc2 = load8888 (*(src + 2));
1679 vsrc3 = load8888 (*(src + 3));
1681 vdest = *(__m64 *)dst;
1683 vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
1684 vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
1685 vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
1686 vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);
1688 *(__m64 *)dst = vdest;
1699 __m64 vsrc = load8888 (*src);
1701 __m64 vdest = expand565 (to_m64 (d), 0);
1703 vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1705 *dst = to_uint64 (vdest);
1717 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1718 pixman_composite_info_t *info)
1720 PIXMAN_COMPOSITE_ARGS (info);
1722 uint32_t *dst_line, *dst;
1723 uint8_t *mask_line, *mask;
1724 int dst_stride, mask_stride;
1731 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1737 srcsrc = (uint64_t)src << 32 | src;
1739 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1740 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1742 vsrc = load8888 (src);
1743 vsrca = expand_alpha (vsrc);
1748 dst_line += dst_stride;
1750 mask_line += mask_stride;
1755 while (w && (unsigned long)dst & 7)
1761 __m64 vdest = in_over (vsrc, vsrca,
1762 expand_alpha_rev (to_m64 (m)),
1765 *dst = store8888 (vdest);
1782 if (srca == 0xff && (m0 & m1) == 0xff)
1784 *(uint64_t *)dst = srcsrc;
1791 vdest = *(__m64 *)dst;
1793 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
1794 expand8888 (vdest, 0));
1795 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
1796 expand8888 (vdest, 1));
1798 *(__m64 *)dst = pack8888 (dest0, dest1);
1814 __m64 vdest = load8888 (*dst);
1817 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
1818 *dst = store8888 (vdest);
1827 pixman_fill_mmx (uint32_t *bits,
1838 uint32_t byte_width;
1841 #if defined __GNUC__ && defined USE_X86_MMX
1842 __m64 v1, v2, v3, v4, v5, v6, v7;
1845 if (bpp != 16 && bpp != 32 && bpp != 8)
1850 stride = stride * (int) sizeof (uint32_t) / 1;
1851 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
1854 xor = (xor & 0xff) * 0x01010101;
1858 stride = stride * (int) sizeof (uint32_t) / 2;
1859 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
1860 byte_width = 2 * width;
1862 xor = (xor & 0xffff) * 0x00010001;
1866 stride = stride * (int) sizeof (uint32_t) / 4;
1867 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
1868 byte_width = 4 * width;
1872 fill = ((uint64_t)xor << 32) | xor;
1873 vfill = to_m64 (fill);
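    /* Below, the fill value is replicated into seven more MMX
     * registers so the store loop can write 64 bytes per iteration. */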
1875 #if defined __GNUC__ && defined USE_X86_MMX
1884 : "=&y" (v1), "=&y" (v2), "=&y" (v3),
1885 "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
1892 uint8_t *d = byte_line;
1894 byte_line += stride;
1897 if (w >= 1 && ((unsigned long)d & 1))
1899 *(uint8_t *)d = (xor & 0xff);
1904 if (w >= 2 && ((unsigned long)d & 3))
1906 *(uint16_t *)d = xor;
1911 while (w >= 4 && ((unsigned long)d & 7))
1913 *(uint32_t *)d = xor;
1921 #if defined __GNUC__ && defined USE_X86_MMX
1933 "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
1934 "y" (v4), "y" (v5), "y" (v6), "y" (v7)
1937 *(__m64*) (d + 0) = vfill;
1938 *(__m64*) (d + 8) = vfill;
1939 *(__m64*) (d + 16) = vfill;
1940 *(__m64*) (d + 24) = vfill;
1941 *(__m64*) (d + 32) = vfill;
1942 *(__m64*) (d + 40) = vfill;
1943 *(__m64*) (d + 48) = vfill;
1944 *(__m64*) (d + 56) = vfill;
1952 *(uint32_t *)d = xor;
1959 *(uint16_t *)d = xor;
1965 *(uint8_t *)d = (xor & 0xff);
1977 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
1978 pixman_composite_info_t *info)
1980 PIXMAN_COMPOSITE_ARGS (info);
1982 uint32_t *dst_line, *dst;
1983 uint8_t *mask_line, *mask;
1984 int dst_stride, mask_stride;
1991 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1996 pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride,
1997 PIXMAN_FORMAT_BPP (dest_image->bits.format),
1998 dest_x, dest_y, width, height, 0);
2002 srcsrc = (uint64_t)src << 32 | src;
2004 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2005 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2007 vsrc = load8888 (src);
2012 dst_line += dst_stride;
2014 mask_line += mask_stride;
2019 while (w && (unsigned long)dst & 7)
2025 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2027 *dst = store8888 (vdest);
2047 if (srca == 0xff && (m0 & m1) == 0xff)
2049 *(uint64_t *)dst = srcsrc;
2055 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2056 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2058 *(__m64 *)dst = pack8888 (dest0, dest1);
2062 *(uint64_t *)dst = 0;
2078 __m64 vdest = load8888 (*dst);
2080 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2081 *dst = store8888 (vdest);
2094 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2095 pixman_composite_info_t *info)
2097 PIXMAN_COMPOSITE_ARGS (info);
2099 uint16_t *dst_line, *dst;
2100 uint8_t *mask_line, *mask;
2101 int dst_stride, mask_stride;
2103 __m64 vsrc, vsrca, tmp;
2104 uint64_t srcsrcsrcsrc, src16;
2108 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2114 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2115 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2117 vsrc = load8888 (src);
2118 vsrca = expand_alpha (vsrc);
2120 tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2121 src16 = to_uint64 (tmp);
2124 (uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
2125 (uint64_t)src16 << 16 | (uint64_t)src16;
2130 dst_line += dst_stride;
2132 mask_line += mask_stride;
2137 while (w && (unsigned long)dst & 7)
2144 __m64 vd = to_m64 (d);
2145 __m64 vdest = in_over (
2146 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2148 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2149 *dst = to_uint64 (vd);
2161 uint64_t m0, m1, m2, m3;
2167 if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2169 *(uint64_t *)dst = srcsrcsrcsrc;
2171 else if (m0 | m1 | m2 | m3)
2174 __m64 vm0, vm1, vm2, vm3;
2176 vdest = *(__m64 *)dst;
2179 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
2180 expand565 (vdest, 0)), vdest, 0);
2182 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
2183 expand565 (vdest, 1)), vdest, 1);
2185 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
2186 expand565 (vdest, 2)), vdest, 2);
2188 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
2189 expand565 (vdest, 3)), vdest, 3);
2191 *(__m64 *)dst = vdest;
2208 __m64 vd = to_m64 (d);
2209 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2211 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2212 *dst = to_uint64 (vd);
2225 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2226 pixman_composite_info_t *info)
2228 PIXMAN_COMPOSITE_ARGS (info);
2229 uint16_t *dst_line, *dst;
2230 uint32_t *src_line, *src;
2231 int dst_stride, src_stride;
2236 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2237 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2241 assert (src_image->drawable == mask_image->drawable);
2247 dst_line += dst_stride;
2249 src_line += src_stride;
2254 while (w && (unsigned long)dst & 7)
2256 __m64 vsrc = load8888 (*src);
2258 __m64 vdest = expand565 (to_m64 (d), 0);
2260 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2262 *dst = to_uint64 (vdest);
2273 uint32_t s0, s1, s2, s3;
2274 unsigned char a0, a1, a2, a3;
2286 if ((a0 & a1 & a2 & a3) == 0xFF)
2289 vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0);
2290 vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1);
2291 vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2);
2292 vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3);
2294 *(__m64 *)dst = vdest;
2296 else if (s0 | s1 | s2 | s3)
2298 __m64 vdest = *(__m64 *)dst;
2300 vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0);
2301 vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1);
2302 vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2);
2303 vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3);
2305 *(__m64 *)dst = vdest;
2317 __m64 vsrc = load8888 (*src);
2319 __m64 vdest = expand565 (to_m64 (d), 0);
2321 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2323 *dst = to_uint64 (vdest);
2335 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2336 pixman_composite_info_t *info)
2338 PIXMAN_COMPOSITE_ARGS (info);
2339 uint32_t *dst_line, *dst;
2340 uint32_t *src_line, *src;
2341 int dst_stride, src_stride;
2346 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2347 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2351 assert (src_image->drawable == mask_image->drawable);
2357 dst_line += dst_stride;
2359 src_line += src_stride;
2362 while (w && (unsigned long)dst & 7)
2364 __m64 s = load8888 (*src);
2365 __m64 d = load8888 (*dst);
2367 *dst = store8888 (over_rev_non_pre (s, d));
2377 unsigned char a0, a1;
2386 if ((a0 & a1) == 0xFF)
2388 d0 = invert_colors (load8888 (s0));
2389 d1 = invert_colors (load8888 (s1));
2391 *(__m64 *)dst = pack8888 (d0, d1);
2395 __m64 vdest = *(__m64 *)dst;
2397 d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0));
2398 d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1));
2400 *(__m64 *)dst = pack8888 (d0, d1);
2410 __m64 s = load8888 (*src);
2411 __m64 d = load8888 (*dst);
2413 *dst = store8888 (over_rev_non_pre (s, d));
2421 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2422 pixman_composite_info_t *info)
2424 PIXMAN_COMPOSITE_ARGS (info);
2427 uint32_t *mask_line;
2428 int dst_stride, mask_stride;
2433 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2438 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2439 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2441 vsrc = load8888 (src);
2442 vsrca = expand_alpha (vsrc);
2447 uint32_t *p = (uint32_t *)mask_line;
2448 uint16_t *q = (uint16_t *)dst_line;
2450 while (twidth && ((unsigned long)q & 7))
2452 uint32_t m = *(uint32_t *)p;
2457 __m64 vdest = expand565 (to_m64 (d), 0);
2458 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
2459 *q = to_uint64 (vdest);
2469 uint32_t m0, m1, m2, m3;
2476 if ((m0 | m1 | m2 | m3))
2478 __m64 vdest = *(__m64 *)q;
2480 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0);
2481 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1);
2482 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2);
2483 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3);
2485 *(__m64 *)q = vdest;
2500 __m64 vdest = expand565 (to_m64 (d), 0);
2501 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
2502 *q = to_uint64 (vdest);
2510 mask_line += mask_stride;
2511 dst_line += dst_stride;
2518 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2519 pixman_composite_info_t *info)
2521 PIXMAN_COMPOSITE_ARGS (info);
2522 uint8_t *dst_line, *dst;
2523 uint8_t *mask_line, *mask;
2524 int dst_stride, mask_stride;
2530 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2531 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2533 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2537 vsrc = load8888 (src);
2538 vsrca = expand_alpha (vsrc);
2543 dst_line += dst_stride;
2545 mask_line += mask_stride;
2548 while (w && (unsigned long)dst & 7)
2557 m = MUL_UN8 (sa, a, tmp);
2558 d = MUL_UN8 (m, d, tmp);
2569 vmask = load8888 (ldl_u((uint32_t *)mask));
2570 vdest = load8888 (*(uint32_t *)dst);
2572 *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));
2588 m = MUL_UN8 (sa, a, tmp);
2589 d = MUL_UN8 (m, d, tmp);
2599 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2600 pixman_composite_info_t *info)
2602 PIXMAN_COMPOSITE_ARGS (info);
2603 uint8_t *dst_line, *dst;
2604 uint8_t *src_line, *src;
2605 int src_stride, dst_stride;
2608 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2609 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2614 dst_line += dst_stride;
2616 src_line += src_stride;
2619 while (w && (unsigned long)dst & 3)
2627 *dst = MUL_UN8 (s, d, tmp);
2636 uint32_t *s = (uint32_t *)src;
2637 uint32_t *d = (uint32_t *)dst;
2639 *d = store8888 (in (load8888 (ldl_u((uint32_t *)s)), load8888 (*d)));
2654 *dst = MUL_UN8 (s, d, tmp);
2665 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2666 pixman_composite_info_t *info)
2668 PIXMAN_COMPOSITE_ARGS (info);
2669 uint8_t *dst_line, *dst;
2670 uint8_t *mask_line, *mask;
2671 int dst_stride, mask_stride;
2677 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2678 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2680 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2687 vsrc = load8888 (src);
2688 vsrca = expand_alpha (vsrc);
2693 dst_line += dst_stride;
2695 mask_line += mask_stride;
2698 while (w && (unsigned long)dst & 3)
2708 m = MUL_UN8 (sa, a, tmp);
2709 r = ADD_UN8 (m, d, tmp);
2720 vmask = load8888 (ldl_u((uint32_t *)mask));
2721 vdest = load8888 (*(uint32_t *)dst);
2723 *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
2740 m = MUL_UN8 (sa, a, tmp);
2741 r = ADD_UN8 (m, d, tmp);
2751 mmx_composite_add_8_8 (pixman_implementation_t *imp,
2752 pixman_composite_info_t *info)
2754 PIXMAN_COMPOSITE_ARGS (info);
2755 uint8_t *dst_line, *dst;
2756 uint8_t *src_line, *src;
2757 int dst_stride, src_stride;
2764 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2765 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2770 dst_line += dst_stride;
2772 src_line += src_stride;
2775 while (w && (unsigned long)dst & 7)
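	    /* Scalar saturating add: with t = *dst + *src, any carry
	     * into bit 8 makes (0 - (t >> 8)) equal 0xff, so the OR
	     * clamps the sum to 255. */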
2780 s = t | (0 - (t >> 8));
2790 *(__m64*)dst = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
2801 s = t | (0 - (t >> 8));
2814 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
2815 pixman_composite_info_t *info)
2817 PIXMAN_COMPOSITE_ARGS (info);
2819 uint32_t *dst_line, *dst;
2820 uint32_t *src_line, *src;
2821 int dst_stride, src_stride;
2826 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2827 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2832 dst_line += dst_stride;
2834 src_line += src_stride;
2837 while (w && (unsigned long)dst & 7)
2839 *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
2840 _mm_cvtsi32_si64 (*dst)));
2848 dst64 = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
2849 *(uint64_t*)dst = to_uint64 (dst64);
2857 *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
2858 _mm_cvtsi32_si64 (*dst)));
2866 static pixman_bool_t
2867 pixman_blt_mmx (uint32_t *src_bits,
2880 uint8_t * src_bytes;
2881 uint8_t * dst_bytes;
2884 if (src_bpp != dst_bpp)
2889 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
2890 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
2891 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
2892 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
2893 byte_width = 2 * width;
2897 else if (src_bpp == 32)
2899 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
2900 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
2901 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
2902 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
2903 byte_width = 4 * width;
2915 uint8_t *s = src_bytes;
2916 uint8_t *d = dst_bytes;
2917 src_bytes += src_stride;
2918 dst_bytes += dst_stride;
2921 if (w >= 1 && ((unsigned long)d & 1))
2923 *(uint8_t *)d = *(uint8_t *)s;
2929 if (w >= 2 && ((unsigned long)d & 3))
2931 *(uint16_t *)d = *(uint16_t *)s;
2937 while (w >= 4 && ((unsigned long)d & 7))
2939 *(uint32_t *)d = ldl_u((uint32_t *)s);
2948 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
2950 "movq (%1), %%mm0\n"
2951 "movq 8(%1), %%mm1\n"
2952 "movq 16(%1), %%mm2\n"
2953 "movq 24(%1), %%mm3\n"
2954 "movq 32(%1), %%mm4\n"
2955 "movq 40(%1), %%mm5\n"
2956 "movq 48(%1), %%mm6\n"
2957 "movq 56(%1), %%mm7\n"
2959 "movq %%mm0, (%0)\n"
2960 "movq %%mm1, 8(%0)\n"
2961 "movq %%mm2, 16(%0)\n"
2962 "movq %%mm3, 24(%0)\n"
2963 "movq %%mm4, 32(%0)\n"
2964 "movq %%mm5, 40(%0)\n"
2965 "movq %%mm6, 48(%0)\n"
2966 "movq %%mm7, 56(%0)\n"
2970 "%mm0", "%mm1", "%mm2", "%mm3",
2971 "%mm4", "%mm5", "%mm6", "%mm7");
2973 __m64 v0 = ldq_u((uint64_t *)(s + 0));
2974 __m64 v1 = ldq_u((uint64_t *)(s + 8));
2975 __m64 v2 = ldq_u((uint64_t *)(s + 16));
2976 __m64 v3 = ldq_u((uint64_t *)(s + 24));
2977 __m64 v4 = ldq_u((uint64_t *)(s + 32));
2978 __m64 v5 = ldq_u((uint64_t *)(s + 40));
2979 __m64 v6 = ldq_u((uint64_t *)(s + 48));
2980 __m64 v7 = ldq_u((uint64_t *)(s + 56));
2981 *(__m64 *)(d + 0) = v0;
2982 *(__m64 *)(d + 8) = v1;
2983 *(__m64 *)(d + 16) = v2;
2984 *(__m64 *)(d + 24) = v3;
2985 *(__m64 *)(d + 32) = v4;
2986 *(__m64 *)(d + 40) = v5;
2987 *(__m64 *)(d + 48) = v6;
2988 *(__m64 *)(d + 56) = v7;
2997 *(uint32_t *)d = ldl_u((uint32_t *)s);
3005 *(uint16_t *)d = *(uint16_t *)s;
3018 mmx_composite_copy_area (pixman_implementation_t *imp,
3019 pixman_composite_info_t *info)
3021 PIXMAN_COMPOSITE_ARGS (info);
3023 pixman_blt_mmx (src_image->bits.bits,
3024 dest_image->bits.bits,
3025 src_image->bits.rowstride,
3026 dest_image->bits.rowstride,
3027 PIXMAN_FORMAT_BPP (src_image->bits.format),
3028 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3029 src_x, src_y, dest_x, dest_y, width, height);
3032 #ifdef USE_ARM_IWMMXT
3034 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3035 pixman_composite_info_t *info)
3037 PIXMAN_COMPOSITE_ARGS (info);
3038 uint32_t *src, *src_line;
3039 uint32_t *dst, *dst_line;
3040 uint8_t *mask, *mask_line;
3041 int src_stride, mask_stride, dst_stride;
3044 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3045 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3046 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3051 src_line += src_stride;
3053 dst_line += dst_stride;
3055 mask_line += mask_stride;
3065 __m64 s = load8888 (*src | 0xff000000);
3069 *dst = store8888 (s);
3073 __m64 sa = expand_alpha (s);
3074 __m64 vm = expand_alpha_rev (to_m64 (m));
3075 __m64 vdest = in_over (s, sa, vm, load8888 (*dst));
3077 *dst = store8888 (vdest);
3091 static const pixman_fast_path_t mmx_fast_paths[] =
3093 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
3094 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
3095 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
3096 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
3097 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
3098 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
3099 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3100 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3101 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
3102 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3103 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3104 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
3105 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
3106 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
3107 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
3108 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
3109 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
3110 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
3111 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
3112 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
3113 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
3114 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
3115 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
3116 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
3117 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
3118 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),
3119 #ifdef USE_ARM_IWMMXT
3120 /* FIXME: This code is commented out since it's apparently
3121 * not actually faster than the generic code on x86.
3123 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
3124 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
3125 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
3126 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
3128 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
3129 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
3130 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
3131 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3132 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3134 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
3135 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
3136 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
3137 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
3138 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
3139 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),
3141 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
3142 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
3143 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
3144 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),
3146 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
3147 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
3148 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
3149 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
3150 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
3151 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
3152 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3153 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3154 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3155 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3156 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
3157 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),
3159 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
3160 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
3165 static pixman_bool_t
3166 mmx_blt (pixman_implementation_t *imp,
3167 uint32_t * src_bits,
3168 uint32_t * dst_bits,
3180 if (!pixman_blt_mmx (
3181 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3182 src_x, src_y, dest_x, dest_y, width, height))
3185 return _pixman_implementation_blt (
3187 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3188 src_x, src_y, dest_x, dest_y, width, height);
3194 static pixman_bool_t
3195 mmx_fill (pixman_implementation_t *imp,
3205 if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
3207 return _pixman_implementation_fill (
3208 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
3214 pixman_implementation_t *
3215 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
3217 pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
3219 imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
3220 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
3221 imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
3222 imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
3223 imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
3224 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
3225 imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
3226 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
3227 imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
3228 imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
3229 imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
3231 imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
3232 imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
3233 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
3234 imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
3235 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
3236 imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
3237 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
3238 imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
3239 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
3240 imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
3241 imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    return imp;
}
3249 #endif /* USE_X86_MMX || USE_ARM_IWMMXT */