/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifdef USE_MMX

#include <mmintrin.h>

#include "pixman-private.h"
#include "pixman-combine32.h"

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif
/* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * register
 *
 *    i.e. use
 *
 *        _mm_mullo_pi16 (x, mmx_constant);
 *
 *    not
 *
 *        _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */
/* --------------- MMX primitives ------------------------------------- */

#ifdef __GNUC__
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
/* If __m64 is defined as a struct or union, define M64_MEMBER to be the
   name of the member used to access the data */
# ifdef _MSC_VER
#  define M64_MEMBER m64_u64
# elif defined(__SUNPRO_C)
#  define M64_MEMBER l_
# endif
#endif
typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_ffff0000ffff0000;
    mmxdatafield mmx_0000ffff00000000;
    mmxdatafield mmx_000000000000ffff;
} mmx_data_t;
#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field = { val ## ULL }
#else                           /* __m64 is an integral type */
# define MMXDATA_INIT(field, val) field = val ## ULL
#endif
static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_ffff0000ffff0000,         0xffff0000ffff0000),
    MMXDATA_INIT (.mmx_0000ffff00000000,         0x0000ffff00000000),
    MMXDATA_INIT (.mmx_000000000000ffff,         0x000000000000ffff),
};
#ifdef __GNUC__
# ifdef __ICC
#  define MC(x) M64 (c.mmx_ ## x)
# else
#  define MC(x) ((__m64)c.mmx_ ## x)
# endif
#else
# define MC(x) c.mmx_ ## x
#endif
static force_inline __m64
M64 (uint64_t x)
{
#ifdef __ICC
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#else                           /* __m64 is an integral type */
    return (__m64)x;
#endif
}
static force_inline uint64_t
UINT64 (__m64 x)
{
#ifdef __ICC
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#else                           /* __m64 is an integral type */
    return (uint64_t)x;
#endif
}
static force_inline __m64
shift (__m64 v, int s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}
static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
    res = _mm_srli_pi16 (res, 8);

    return res;
}
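/* Scalar model of pix_multiply () above (illustrative sketch, not part
 * of the original file): with t = a * b + 0x80, the expression
 * (t + (t >> 8)) >> 8 is the standard exact-rounding substitute for
 * t / 255 on 8-bit channels, which is what the three MMX operations
 * above compute in all four 16-bit lanes at once. */
static force_inline uint8_t
pix_multiply_scalar (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) (a * b) + 0x80;    /* bias for rounding */

    return (uint8_t) ((t + (t >> 8)) >> 8);    /* ~ t / 255, rounded */
}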
static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}
static force_inline __m64
expand_alpha (__m64 pixel)
{
    __m64 t1, t2;

    t1 = shift (pixel, -48);
    t2 = shift (t1, 16);
    t1 = _mm_or_si64 (t1, t2);
    t2 = shift (t1, 32);
    t1 = _mm_or_si64 (t1, t2);

    return t1;
}
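/* Worked example (illustrative note): for an expanded pixel
 * 0x00AA00RR00GG00BB, expand_alpha () returns 0x00AA00AA00AA00AA --
 * the alpha word is shifted down to the low word, then OR-ed back
 * across all four 16-bit lanes by the two shift/or pairs above. */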
static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    __m64 t1, t2;

    /* move alpha to low 16 bits and zero the rest */
    t1 = shift (pixel,  48);
    t1 = shift (t1, -48);

    t2 = shift (t1, 16);
    t1 = _mm_or_si64 (t1, t2);
    t2 = shift (t1, 32);
    t1 = _mm_or_si64 (t1, t2);

    return t1;
}
static force_inline __m64
invert_colors (__m64 pixel)
{
    __m64 x, y, z;

    x = y = z = pixel;

    x = _mm_and_si64 (x, MC (ffff0000ffff0000));
    y = _mm_and_si64 (y, MC (000000000000ffff));
    z = _mm_and_si64 (z, MC (0000ffff00000000));

    y = shift (y, 32);
    z = shift (z, -32);

    x = _mm_or_si64 (x, y);
    x = _mm_or_si64 (x, z);

    return x;
}
static force_inline __m64
over (__m64 src, __m64 srca, __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}
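/* Per-channel model of over () (illustrative note): with premultiplied
 * 8-bit channels,
 *
 *     result = src + DIV_255 ((255 - srca) * dest)
 *
 * negate () supplies (255 - srca), pix_multiply () the rounded
 * division by 255, and _mm_adds_pu8 () the saturating add. */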
static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}
static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}
static force_inline __m64
in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)
{
    src = _mm_or_si64 (src, MC (full_alpha));

    return over (in (src, mask), mask, dest);
}
#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

#define in_over(src, srca, mask, dest)                                  \
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif
static force_inline __m64
load8888 (uint32_t v)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ());
}
static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}
static force_inline uint32_t
store8888 (__m64 v)
{
    return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of an MMX register into
 * 00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
    p = shift (shift (p, (3 - pos) * 16), -48);

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
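/* Scalar equivalent for a single pixel (illustrative sketch, not part
 * of the original file); the shift/mask/multiply trick above computes
 * the same "replicate high bits into low bits" channel widening for
 * one 16-bit word of the register. */
static force_inline uint32_t
expand565_scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);    /* 5 bits -> 8 bits */
    g = (g << 2) | (g >> 4);    /* 6 bits -> 8 bits */
    b = (b << 3) | (b >> 2);    /* 5 bits -> 8 bits */

    return (r << 16) | (g << 8) | b;
}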
static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}
static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}
static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0 + 3) + pos * 16);

    if (pos == 0)
        t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
        t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
        t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
        t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
}
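/* Scalar model of the channel extraction in pack_565 () (illustrative
 * note): for one expanded pixel 0x00RR00GG00BB,
 *
 *     r5g6b5 = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3)
 *
 * which is what the 565_r/565_g/565_b masks and the shifts compute,
 * with @pos selecting which 16-bit word of `target` is replaced. */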
#ifndef _MSC_VER

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}

#else

#define pix_add_mul(x, a, y, b)  \
    ( x = pix_multiply (x, a),   \
      y = pix_multiply (y, b),   \
      pix_add (x, y) )

#endif
/* --------------- MMX code paths for fbcompose.c --------------------- */

static force_inline uint32_t
combine (const uint32_t *src, const uint32_t *mask)
{
    uint32_t ssrc = *src;

    if (mask)
    {
        __m64 m = load8888 (*mask);
        __m64 s = load8888 (ssrc);

        m = expand_alpha (m);
        s = pix_multiply (s, m);

        ssrc = store8888 (s);
    }

    return ssrc;
}
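/* Scalar model of combine () (illustrative note): with a mask present,
 * each source channel is scaled by the mask's alpha before the
 * operator proper runs, i.e. ssrc_c = DIV_255 (ssrc_c * mask_alpha)
 * for c in {a, r, g, b}; with no mask the source is used unchanged. */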
static void
mmx_combine_over_u (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dest, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        uint32_t ssrc = combine (src, mask);
        uint32_t a = ssrc >> 24;

        if (a == 0xff)
        {
            *dest = ssrc;
        }
        else if (ssrc)
        {
            __m64 s, sa;

            s = load8888 (ssrc);
            sa = expand_alpha (s);
            *dest = store8888 (over (s, sa, load8888 (*dest)));
        }

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
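/* The two scalar tests above are shortcuts, not correctness
 * requirements (illustrative note): an opaque source (a == 0xff)
 * overwrites the destination outright, and a fully transparent source
 * (ssrc == 0) leaves it untouched, so the MMX math only runs for the
 * remaining translucent pixels. */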
static void
mmx_combine_over_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                            uint32_t *dest, const uint32_t *src,
                            const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d, da;
        uint32_t s = combine (src, mask);

        d = load8888 (*dest);
        da = expand_alpha (d);
        *dest = store8888 (over (d, da, load8888 (s)));

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_in_u (pixman_implementation_t *imp, pixman_op_t op,
                  uint32_t *dest, const uint32_t *src,
                  const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 x, a;

        x = load8888 (combine (src, mask));
        a = load8888 (*dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);

        *dest = store8888 (x);

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                          uint32_t *dest, const uint32_t *src,
                          const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 x, a;

        x = load8888 (*dest);
        a = load8888 (combine (src, mask));
        a = expand_alpha (a);
        x = pix_multiply (x, a);
        *dest = store8888 (x);

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_out_u (pixman_implementation_t *imp, pixman_op_t op,
                   uint32_t *dest, const uint32_t *src,
                   const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 x, a;

        x = load8888 (combine (src, mask));
        a = load8888 (*dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);
        *dest = store8888 (x);

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                           uint32_t *dest, const uint32_t *src,
                           const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 x, a;

        x = load8888 (*dest);
        a = load8888 (combine (src, mask));
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);

        *dest = store8888 (x);

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_atop_u (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dest, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 s, da, d, sia;

        s = load8888 (combine (src, mask));
        d = load8888 (*dest);
        sia = expand_alpha (s);
        sia = negate (sia);
        da = expand_alpha (d);
        s = pix_add_mul (s, da, d, sia);
        *dest = store8888 (s);

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_atop_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                            uint32_t *dest, const uint32_t *src,
                            const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 s, dia, d, sa;

        s = load8888 (combine (src, mask));
        d = load8888 (*dest);
        sa = expand_alpha (s);
        dia = expand_alpha (d);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sa);
        *dest = store8888 (s);

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_xor_u (pixman_implementation_t *imp, pixman_op_t op,
                   uint32_t *dest, const uint32_t *src,
                   const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 s, dia, d, sia;

        s = load8888 (combine (src, mask));
        d = load8888 (*dest);
        sia = expand_alpha (s);
        dia = expand_alpha (d);
        sia = negate (sia);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sia);
        *dest = store8888 (s);

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_add_u (pixman_implementation_t *imp, pixman_op_t op,
                   uint32_t *dest, const uint32_t *src,
                   const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 s, d;

        s = load8888 (combine (src, mask));
        d = load8888 (*dest);
        s = pix_add (s, d);
        *dest = store8888 (s);

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_saturate_u (pixman_implementation_t *imp, pixman_op_t op,
                        uint32_t *dest, const uint32_t *src,
                        const uint32_t *mask, int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        uint32_t s = combine (src, mask);
        uint32_t d = *dest;
        __m64 ms = load8888 (s);
        __m64 md = load8888 (d);
        uint32_t sa = s >> 24;
        uint32_t da = ~d >> 24;

        if (sa > da)
        {
            __m64 msa = load8888 (DIV_UN8 (da, sa) << 24);
            msa = expand_alpha (msa);
            ms = pix_multiply (ms, msa);
        }

        md = pix_add (md, ms);
        *dest = store8888 (md);

        ++src;
        ++dest;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_src_ca (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dest, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);

        s = pix_multiply (s, a);
        *dest = store8888 (s);

        ++src;
        ++mask;
        ++dest;
    }
    _mm_empty ();
}
static void
mmx_combine_over_ca (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dest, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 sa = expand_alpha (s);

        *dest = store8888 (in_over (s, sa, a, d));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                             uint32_t *dest, const uint32_t *src,
                             const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);

        *dest = store8888 (over (d, da, in (s, a)));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_in_ca (pixman_implementation_t *imp, pixman_op_t op,
                   uint32_t *dest, const uint32_t *src,
                   const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        *dest = store8888 (s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                           uint32_t *dest, const uint32_t *src,
                           const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        d = pix_multiply (d, a);
        *dest = store8888 (d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_out_ca (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dest, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);

        da = negate (da);
        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        *dest = store8888 (s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                            uint32_t *dest, const uint32_t *src,
                            const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_multiply (d, a);
        *dest = store8888 (d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_atop_ca (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dest, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        *dest = store8888 (d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                             uint32_t *dest, const uint32_t *src,
                             const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        d = pix_add_mul (d, a, s, da);
        *dest = store8888 (d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_xor_ca (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dest, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        *dest = store8888 (d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_add_ca (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dest, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);

        s = pix_multiply (s, a);
        d = pix_add (s, d);
        *dest = store8888 (d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}
/* ------------- MMX code paths called from fbpict.c -------------------- */
static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp, pixman_op_t op,
                           pixman_image_t *src_image,
                           pixman_image_t *mask_image,
                           pixman_image_t *dst_image,
                           int32_t src_x, int32_t src_y,
                           int32_t mask_x, int32_t mask_y,
                           int32_t dest_x, int32_t dest_y,
                           int32_t width, int32_t height)
{
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 7)
        {
            *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
            w--;
            dst++;
        }

        while (w >= 2)
        {
            __m64 vdest;
            __m64 dest0, dest1;

            vdest = *(__m64 *)dst;

            dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
            dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

            *(__m64 *)dst = pack8888 (dest0, dest1);

            dst += 2;
            w -= 2;
        }

        while (w)
        {
            *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
            w--;
            dst++;
        }
    }

    _mm_empty ();
}
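/* All of the composite routines below follow the three-phase pattern
 * seen above (illustrative note): a scalar head loop until the
 * destination pointer is 8-byte aligned, an MMX body processing
 * several pixels per iteration, and a scalar tail for the leftovers,
 * finishing with _mm_empty () so the x87 FPU state is restored. */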
static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp, pixman_op_t op,
                           pixman_image_t *src_image,
                           pixman_image_t *mask_image,
                           pixman_image_t *dst_image,
                           int32_t src_x, int32_t src_y,
                           int32_t mask_x, int32_t mask_y,
                           int32_t dest_x, int32_t dest_y,
                           int32_t width, int32_t height)
{
    uint32_t src;
    uint16_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 7)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (M64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = UINT64 (vdest);

            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m64 vdest;

            vdest = *(__m64 *)dst;

            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);

            *(__m64 *)dst = vdest;

            dst += 4;
            w -= 4;
        }

        while (w)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (M64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = UINT64 (vdest);

            w--;
            dst++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, pixman_op_t op,
                                   pixman_image_t *src_image,
                                   pixman_image_t *mask_image,
                                   pixman_image_t *dst_image,
                                   int32_t src_x, int32_t src_y,
                                   int32_t mask_x, int32_t mask_y,
                                   int32_t dest_x, int32_t dest_y,
                                   int32_t width, int32_t height)
{
    uint32_t src;
    uint32_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        uint32_t *p = (uint32_t *)mask_line;
        uint32_t *q = (uint32_t *)dst_line;

        while (twidth && (unsigned long)q & 7)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (*q);
                vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
                *q = store8888 (vdest);
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 2)
        {
            uint32_t m0, m1;

            m0 = *p;
            m1 = *(p + 1);

            if (m0 | m1)
            {
                __m64 dest0, dest1;
                __m64 vdest = *(__m64 *)q;

                dest0 = in_over (vsrc, vsrca, load8888 (m0),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, load8888 (m1),
                                 expand8888 (vdest, 1));

                *(__m64 *)q = pack8888 (dest0, dest1);
            }

            p += 2;
            q += 2;
            twidth -= 2;
        }

        while (twidth)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (*q);
                vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
                *q = store8888 (vdest);
            }

            twidth--;
            p++;
            q++;
        }

        dst_line += dst_stride;
        mask_line += mask_stride;
    }

    _mm_empty ();
}
static void
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, pixman_op_t op,
                                pixman_image_t *src_image,
                                pixman_image_t *mask_image,
                                pixman_image_t *dst_image,
                                int32_t src_x, int32_t src_y,
                                int32_t mask_x, int32_t mask_y,
                                int32_t dest_x, int32_t dest_y,
                                int32_t width, int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
    vmask = load8888 (mask);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (unsigned long)dst & 7)
        {
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            __m64 vs = *(__m64 *)src;
            __m64 vd = *(__m64 *)dst;
            __m64 vsrc0 = expand8888 (vs, 0);
            __m64 vsrc1 = expand8888 (vs, 1);

            *(__m64 *)dst = pack8888 (
                in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
                in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

            w -= 2;
            dst += 2;
            src += 2;
        }

        while (w)
        {
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, pixman_op_t op,
                                pixman_image_t *src_image,
                                pixman_image_t *mask_image,
                                pixman_image_t *dst_image,
                                int32_t src_x, int32_t src_y,
                                int32_t mask_x, int32_t mask_y,
                                int32_t dest_x, int32_t dest_y,
                                int32_t width, int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask, srca;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);

    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
    vmask = load8888 (mask);
    srca = MC (4x00ff);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (unsigned long)dst & 7)
        {
            __m64 s = load8888 (*src | 0xff000000);
            __m64 d = load8888 (*dst);

            *dst = store8888 (in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 16)
        {
            __m64 vd0 = *(__m64 *)(dst + 0);
            __m64 vd1 = *(__m64 *)(dst + 2);
            __m64 vd2 = *(__m64 *)(dst + 4);
            __m64 vd3 = *(__m64 *)(dst + 6);
            __m64 vd4 = *(__m64 *)(dst + 8);
            __m64 vd5 = *(__m64 *)(dst + 10);
            __m64 vd6 = *(__m64 *)(dst + 12);
            __m64 vd7 = *(__m64 *)(dst + 14);

            __m64 vs0 = *(__m64 *)(src + 0);
            __m64 vs1 = *(__m64 *)(src + 2);
            __m64 vs2 = *(__m64 *)(src + 4);
            __m64 vs3 = *(__m64 *)(src + 6);
            __m64 vs4 = *(__m64 *)(src + 8);
            __m64 vs5 = *(__m64 *)(src + 10);
            __m64 vs6 = *(__m64 *)(src + 12);
            __m64 vs7 = *(__m64 *)(src + 14);

            vd0 = pack8888 (
                in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
                in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

            vd1 = pack8888 (
                in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
                in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

            vd2 = pack8888 (
                in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
                in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

            vd3 = pack8888 (
                in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
                in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

            vd4 = pack8888 (
                in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
                in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

            vd5 = pack8888 (
                in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
                in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

            vd6 = pack8888 (
                in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
                in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

            vd7 = pack8888 (
                in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
                in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

            *(__m64 *)(dst + 0) = vd0;
            *(__m64 *)(dst + 2) = vd1;
            *(__m64 *)(dst + 4) = vd2;
            *(__m64 *)(dst + 6) = vd3;
            *(__m64 *)(dst + 8) = vd4;
            *(__m64 *)(dst + 10) = vd5;
            *(__m64 *)(dst + 12) = vd6;
            *(__m64 *)(dst + 14) = vd7;

            w -= 16;
            dst += 16;
            src += 16;
        }

        while (w)
        {
            __m64 s = load8888 (*src | 0xff000000);
            __m64 d = load8888 (*dst);

            *dst = store8888 (in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_8888_8888 (pixman_implementation_t *imp, pixman_op_t op,
                              pixman_image_t *src_image,
                              pixman_image_t *mask_image,
                              pixman_image_t *dst_image,
                              int32_t src_x, int32_t src_y,
                              int32_t mask_x, int32_t mask_y,
                              int32_t dest_x, int32_t dest_y,
                              int32_t width, int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w--)
        {
            __m64 ms, sa;

            ms = load8888 (*src);
            sa = expand_alpha (ms);
            *dst = store8888 (over (ms, sa, load8888 (*dst)));

            src++;
            dst++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp, pixman_op_t op,
                              pixman_image_t *src_image,
                              pixman_image_t *mask_image,
                              pixman_image_t *dst_image,
                              int32_t src_x, int32_t src_y,
                              int32_t mask_x, int32_t mask_y,
                              int32_t dest_x, int32_t dest_y,
                              int32_t width, int32_t height)
{
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        CHECKPOINT ();

        while (w && (unsigned long)dst & 7)
        {
            __m64 vsrc = load8888 (*src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (M64 (d), 0);

            vdest = pack_565 (
                over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = UINT64 (vdest);

            w--;
            dst++;
            src++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            __m64 vsrc0, vsrc1, vsrc2, vsrc3;
            __m64 vdest;

            vsrc0 = load8888 (*(src + 0));
            vsrc1 = load8888 (*(src + 1));
            vsrc2 = load8888 (*(src + 2));
            vsrc3 = load8888 (*(src + 3));

            vdest = *(__m64 *)dst;

            vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
            vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
            vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
            vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);

            *(__m64 *)dst = vdest;

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            __m64 vsrc = load8888 (*src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (M64 (d), 0);

            vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = UINT64 (vdest);

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, pixman_op_t op,
                             pixman_image_t *src_image,
                             pixman_image_t *mask_image,
                             pixman_image_t *dst_image,
                             int32_t src_x, int32_t src_y,
                             int32_t mask_x, int32_t mask_y,
                             int32_t dest_x, int32_t dest_y,
                             int32_t width, int32_t height)
{
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (unsigned long)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in_over (vsrc, vsrca,
                                       expand_alpha_rev (M64 (m)),
                                       load8888 (*dst));

                *dst = store8888 (vdest);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 2)
        {
            uint64_t m0, m1;

            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(uint64_t *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 vdest;
                __m64 dest0, dest1;

                vdest = *(__m64 *)dst;

                dest0 = in_over (vsrc, vsrca, expand_alpha_rev (M64 (m0)),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, expand_alpha_rev (M64 (m1)),
                                 expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (dest0, dest1);
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = load8888 (*dst);

                vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (M64 (m)), vdest);
                *dst = store8888 (vdest);
            }

            w--;
            mask++;
            dst++;
        }
    }

    _mm_empty ();
}
pixman_bool_t
pixman_fill_mmx (uint32_t *bits,
                 int       stride,
                 int       bpp,
                 int       x,
                 int       y,
                 int       width,
                 int       height,
                 uint32_t  xor)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t *byte_line;

#ifdef __GNUC__
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
        return FALSE;

    if (bpp == 8)
    {
        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
        byte_width = width;
        stride *= 1;
        xor = (xor & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;
        xor = (xor & 0xffff) * 0x00010001;
    }
    else
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
    }

    fill = ((uint64_t)xor << 32) | xor;
    vfill = M64 (fill);

#ifdef __GNUC__
    __asm__ (
        "movq	%7,	%0\n"
        "movq	%7,	%1\n"
        "movq	%7,	%2\n"
        "movq	%7,	%3\n"
        "movq	%7,	%4\n"
        "movq	%7,	%5\n"
        "movq	%7,	%6\n"
        : "=y" (v1), "=y" (v2), "=y" (v3),
          "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7)
        : "y" (vfill));
#endif

    while (height--)
    {
        int w;
        uint8_t *d = byte_line;

        byte_line += stride;
        w = byte_width;

        while (w >= 1 && ((unsigned long)d & 1))
        {
            *(uint8_t *)d = (xor & 0xff);
            w--;
            d++;
        }

        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = xor;
            w -= 2;
            d += 2;
        }

        while (w >= 4 && ((unsigned long)d & 7))
        {
            *(uint32_t *)d = xor;

            w -= 4;
            d += 4;
        }

        while (w >= 64)
        {
#ifdef __GNUC__
            __asm__ (
                "movq	%1,	  (%0)\n"
                "movq	%2,	 8(%0)\n"
                "movq	%3,	16(%0)\n"
                "movq	%4,	24(%0)\n"
                "movq	%5,	32(%0)\n"
                "movq	%6,	40(%0)\n"
                "movq	%7,	48(%0)\n"
                "movq	%8,	56(%0)\n"
                :
                : "r" (d),
                  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
                  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
                : "memory");
#else
            *(__m64*) (d + 0) = vfill;
            *(__m64*) (d + 8) = vfill;
            *(__m64*) (d + 16) = vfill;
            *(__m64*) (d + 24) = vfill;
            *(__m64*) (d + 32) = vfill;
            *(__m64*) (d + 40) = vfill;
            *(__m64*) (d + 48) = vfill;
            *(__m64*) (d + 56) = vfill;
#endif
            w -= 64;
            d += 64;
        }

        while (w >= 4)
        {
            *(uint32_t *)d = xor;

            w -= 4;
            d += 4;
        }
        while (w >= 2)
        {
            *(uint16_t *)d = xor;
            w -= 2;
            d += 2;
        }
        while (w >= 1)
        {
            *(uint8_t *)d = (xor & 0xff);
            w--;
            d++;
        }
    }

    _mm_empty ();
    return TRUE;
}
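/* Usage sketch (illustrative, not part of the original file): fill a
 * 100x50 region of a 32 bpp buffer with opaque red; the region size
 * and color are arbitrary example values, and `stride` is in uint32_t
 * units per pixman's rowstride convention. */
static void
fill_example (uint32_t *bits, int stride)
{
    pixman_fill_mmx (bits, stride, 32, 0, 0, 100, 50, 0xffff0000);
}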
static void
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, pixman_op_t op,
                            pixman_image_t *src_image,
                            pixman_image_t *mask_image,
                            pixman_image_t *dst_image,
                            int32_t src_x, int32_t src_y,
                            int32_t mask_x, int32_t mask_y,
                            int32_t dest_x, int32_t dest_y,
                            int32_t width, int32_t height)
{
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
        pixman_fill_mmx (dst_image->bits.bits, dst_image->bits.rowstride,
                         PIXMAN_FORMAT_BPP (dst_image->bits.format),
                         dest_x, dest_y, width, height, 0);
        return;
    }

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (unsigned long)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in (vsrc, expand_alpha_rev (M64 (m)));

                *dst = store8888 (vdest);
            }
            else
            {
                *dst = 0;
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 2)
        {
            uint64_t m0, m1;

            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(uint64_t *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 vdest;
                __m64 dest0, dest1;

                vdest = *(__m64 *)dst;

                dest0 = in (vsrc, expand_alpha_rev (M64 (m0)));
                dest1 = in (vsrc, expand_alpha_rev (M64 (m1)));

                *(__m64 *)dst = pack8888 (dest0, dest1);
            }
            else
            {
                *(uint64_t *)dst = 0;
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = load8888 (*dst);

                vdest = in (vsrc, expand_alpha_rev (M64 (m)));
                *dst = store8888 (vdest);
            }
            else
            {
                *dst = 0;
            }

            w--;
            mask++;
            dst++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, pixman_op_t op,
                             pixman_image_t *src_image,
                             pixman_image_t *mask_image,
                             pixman_image_t *dst_image,
                             int32_t src_x, int32_t src_y,
                             int32_t mask_x, int32_t mask_y,
                             int32_t dest_x, int32_t dest_y,
                             int32_t width, int32_t height)
{
    uint32_t src, srca;
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca, tmp;
    uint64_t srcsrcsrcsrc, src16;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    src16 = UINT64 (tmp);

    srcsrcsrcsrc =
        (uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
        (uint64_t)src16 << 16 | (uint64_t)src16;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (unsigned long)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = M64 (d);
                __m64 vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (M64 (m)), expand565 (vd, 0));

                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = UINT64 (vd);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            uint64_t m0, m1, m2, m3;

            m0 = *mask;
            m1 = *(mask + 1);
            m2 = *(mask + 2);
            m3 = *(mask + 3);

            if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
            {
                *(uint64_t *)dst = srcsrcsrcsrc;
            }
            else if (m0 | m1 | m2 | m3)
            {
                __m64 vdest;
                __m64 vm0, vm1, vm2, vm3;

                vdest = *(__m64 *)dst;

                vm0 = M64 (m0);
                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
                                           expand565 (vdest, 0)), vdest, 0);
                vm1 = M64 (m1);
                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
                                           expand565 (vdest, 1)), vdest, 1);
                vm2 = M64 (m2);
                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
                                           expand565 (vdest, 2)), vdest, 2);
                vm3 = M64 (m3);
                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
                                           expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)dst = vdest;
            }

            w -= 4;
            mask += 4;
            dst += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = M64 (d);
                __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (M64 (m)),
                                       expand565 (vd, 0));
                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = UINT64 (vd);
            }

            w--;
            mask++;
            dst++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, pixman_op_t op,
                                pixman_image_t *src_image,
                                pixman_image_t *mask_image,
                                pixman_image_t *dst_image,
                                int32_t src_x, int32_t src_y,
                                int32_t mask_x, int32_t mask_y,
                                int32_t dest_x, int32_t dest_y,
                                int32_t width, int32_t height)
{
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        CHECKPOINT ();

        while (w && (unsigned long)dst & 7)
        {
            __m64 vsrc = load8888 (*src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (M64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = UINT64 (vdest);

            w--;
            dst++;
            src++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            uint32_t s0, s1, s2, s3;
            unsigned char a0, a1, a2, a3;

            s0 = *src;
            s1 = *(src + 1);
            s2 = *(src + 2);
            s3 = *(src + 3);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);
            a2 = (s2 >> 24);
            a3 = (s3 >> 24);

            if ((a0 & a1 & a2 & a3) == 0xFF)
            {
                __m64 vdest;

                vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0);
                vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1);
                vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2);
                vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3);

                *(__m64 *)dst = vdest;
            }
            else if (s0 | s1 | s2 | s3)
            {
                __m64 vdest = *(__m64 *)dst;

                vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0);
                vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1);
                vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2);
                vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)dst = vdest;
            }

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            __m64 vsrc = load8888 (*src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (M64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = UINT64 (vdest);

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, pixman_op_t op,
                                pixman_image_t *src_image,
                                pixman_image_t *mask_image,
                                pixman_image_t *dst_image,
                                int32_t src_x, int32_t src_y,
                                int32_t mask_x, int32_t mask_y,
                                int32_t dest_x, int32_t dest_y,
                                int32_t width, int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (unsigned long)dst & 7)
        {
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = store8888 (over_rev_non_pre (s, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            uint32_t s0, s1;
            unsigned char a0, a1;
            __m64 d0, d1;

            s0 = *src;
            s1 = *(src + 1);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);

            if ((a0 & a1) == 0xFF)
            {
                d0 = invert_colors (load8888 (s0));
                d1 = invert_colors (load8888 (s1));

                *(__m64 *)dst = pack8888 (d0, d1);
            }
            else if (s0 | s1)
            {
                __m64 vdest = *(__m64 *)dst;

                d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0));
                d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (d0, d1);
            }

            w -= 2;
            dst += 2;
            src += 2;
        }

        while (w)
        {
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = store8888 (over_rev_non_pre (s, d));

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, pixman_op_t op,
                                   pixman_image_t *src_image,
                                   pixman_image_t *mask_image,
                                   pixman_image_t *dst_image,
                                   int32_t src_x, int32_t src_y,
                                   int32_t mask_x, int32_t mask_y,
                                   int32_t dest_x, int32_t dest_y,
                                   int32_t width, int32_t height)
{
    uint32_t src;
    uint16_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        uint32_t *p = (uint32_t *)mask_line;
        uint16_t *q = (uint16_t *)dst_line;

        while (twidth && ((unsigned long)q & 7))
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                uint64_t d = *q;
                __m64 vdest = expand565 (M64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
                *q = UINT64 (vdest);
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 4)
        {
            uint32_t m0, m1, m2, m3;

            m0 = *p;
            m1 = *(p + 1);
            m2 = *(p + 2);
            m3 = *(p + 3);

            if ((m0 | m1 | m2 | m3))
            {
                __m64 vdest = *(__m64 *)q;

                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)q = vdest;
            }

            twidth -= 4;
            p += 4;
            q += 4;
        }

        while (twidth)
        {
            uint32_t m;

            m = *(uint32_t *)p;
            if (m)
            {
                uint64_t d = *q;
                __m64 vdest = expand565 (M64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
                *q = UINT64 (vdest);
            }

            twidth--;
            p++;
            q++;
        }

        mask_line += mask_stride;
        dst_line += dst_stride;
    }

    _mm_empty ();
}
static void
mmx_composite_in_n_8_8 (pixman_implementation_t *imp, pixman_op_t op,
                        pixman_image_t *src_image,
                        pixman_image_t *mask_image,
                        pixman_image_t *dst_image,
                        int32_t src_x, int32_t src_y,
                        int32_t mask_x, int32_t mask_y,
                        int32_t dest_x, int32_t dest_y,
                        int32_t width, int32_t height)
{
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    sa = src >> 24;

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        if ((((unsigned long)dst_image & 3) == 0) &&
            (((unsigned long)src_image & 3) == 0))
        {
            while (w >= 4)
            {
                __m64 vmask;
                __m64 vdest;

                vmask = load8888 (*(uint32_t *)mask);
                vdest = load8888 (*(uint32_t *)dst);

                *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));

                dst += 4;
                mask += 4;
                w -= 4;
            }
        }

        while (w--)
        {
            uint16_t tmp;
            uint8_t a;
            uint32_t m, d;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            d = MUL_UN8 (m, d, tmp);

            *dst++ = d;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_in_8_8 (pixman_implementation_t *imp, pixman_op_t op,
                      pixman_image_t *src_image,
                      pixman_image_t *mask_image,
                      pixman_image_t *dst_image,
                      int32_t src_x, int32_t src_y,
                      int32_t mask_x, int32_t mask_y,
                      int32_t dest_x, int32_t dest_y,
                      int32_t width, int32_t height)
{
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        if ((((unsigned long)dst_image & 3) == 0) &&
            (((unsigned long)src_image & 3) == 0))
        {
            while (w >= 4)
            {
                uint32_t *s = (uint32_t *)src;
                uint32_t *d = (uint32_t *)dst;

                *d = store8888 (in (load8888 (*s), load8888 (*d)));

                w -= 4;
                dst += 4;
                src += 4;
            }
        }

        while (w--)
        {
            uint8_t s, d;
            uint16_t tmp;

            s = *src;
            d = *dst;

            *dst = MUL_UN8 (s, d, tmp);

            src++;
            dst++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_add_n_8_8 (pixman_implementation_t *imp, pixman_op_t op,
                         pixman_image_t *src_image,
                         pixman_image_t *mask_image,
                         pixman_image_t *dst_image,
                         int32_t src_x, int32_t src_y,
                         int32_t mask_x, int32_t mask_y,
                         int32_t dest_x, int32_t dest_y,
                         int32_t width, int32_t height)
{
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    sa = src >> 24;

    if (src == 0)
        return;

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        if ((((unsigned long)mask_image & 3) == 0) &&
            (((unsigned long)dst_image & 3) == 0))
        {
            while (w >= 4)
            {
                __m64 vmask = load8888 (*(uint32_t *)mask);
                __m64 vdest = load8888 (*(uint32_t *)dst);

                *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));

                w -= 4;
                dst += 4;
                mask += 4;
            }
        }

        while (w--)
        {
            uint16_t tmp;
            uint16_t a;
            uint32_t m, d;
            uint32_t r;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            r = ADD_UN8 (m, d, tmp);

            *dst++ = r;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_add_8000_8000 (pixman_implementation_t *imp, pixman_op_t op,
                             pixman_image_t *src_image,
                             pixman_image_t *mask_image,
                             pixman_image_t *dst_image,
                             int32_t src_x, int32_t src_y,
                             int32_t mask_x, int32_t mask_y,
                             int32_t dest_x, int32_t dest_y,
                             int32_t width, int32_t height)
{
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint8_t s, d;
    uint16_t t;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (unsigned long)dst & 7)
        {
            s = *src;
            d = *dst;
            t = d + s;
            s = t | (0 - (t >> 8));
            *dst = s;

            dst++;
            src++;
            w--;
        }

        while (w >= 8)
        {
            *(__m64*)dst = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);
            dst += 8;
            src += 8;
            w -= 8;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            t = d + s;
            s = t | (0 - (t >> 8));
            *dst = s;

            dst++;
            src++;
            w--;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_add_8888_8888 (pixman_implementation_t *imp, pixman_op_t op,
                             pixman_image_t *src_image,
                             pixman_image_t *mask_image,
                             pixman_image_t *dst_image,
                             int32_t src_x, int32_t src_y,
                             int32_t mask_x, int32_t mask_y,
                             int32_t dest_x, int32_t dest_y,
                             int32_t width, int32_t height)
{
    __m64 dst64;
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (unsigned long)dst & 7)
        {
            *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
                                                   _mm_cvtsi32_si64 (*dst)));
            dst++;
            src++;
            w--;
        }

        while (w >= 2)
        {
            dst64 = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);
            *(uint64_t*)dst = UINT64 (dst64);
            dst += 2;
            src += 2;
            w -= 2;
        }

        if (w)
        {
            *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
                                                   _mm_cvtsi32_si64 (*dst)));
        }
    }

    _mm_empty ();
}
static pixman_bool_t
pixman_blt_mmx (uint32_t *src_bits,
                uint32_t *dst_bits,
                int       src_stride,
                int       dst_stride,
                int       src_bpp,
                int       dst_bpp,
                int       src_x,
                int       src_y,
                int       dst_x,
                int       dst_y,
                int       width,
                int       height)
{
    uint8_t *src_bytes;
    uint8_t *dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
        return FALSE;

    if (src_bpp == 16)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
    }
    else
    {
        return FALSE;
    }

    while (height--)
    {
        int w;
        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;
        w = byte_width;

        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }

        while (w >= 4 && ((unsigned long)d & 7))
        {
            *(uint32_t *)d = *(uint32_t *)s;

            w -= 4;
            s += 4;
            d += 4;
        }

        while (w >= 64)
        {
#if defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
            __asm__ (
                "movq	  (%1),	%%mm0\n"
                "movq	 8(%1),	%%mm1\n"
                "movq	16(%1),	%%mm2\n"
                "movq	24(%1),	%%mm3\n"
                "movq	32(%1),	%%mm4\n"
                "movq	40(%1),	%%mm5\n"
                "movq	48(%1),	%%mm6\n"
                "movq	56(%1),	%%mm7\n"

                "movq	%%mm0,	  (%0)\n"
                "movq	%%mm1,	 8(%0)\n"
                "movq	%%mm2,	16(%0)\n"
                "movq	%%mm3,	24(%0)\n"
                "movq	%%mm4,	32(%0)\n"
                "movq	%%mm5,	40(%0)\n"
                "movq	%%mm6,	48(%0)\n"
                "movq	%%mm7,	56(%0)\n"
                :
                : "r" (d), "r" (s)
                : "memory",
                  "%mm0", "%mm1", "%mm2", "%mm3",
                  "%mm4", "%mm5", "%mm6", "%mm7");
#else
            __m64 v0 = *(__m64 *)(s + 0);
            __m64 v1 = *(__m64 *)(s + 8);
            __m64 v2 = *(__m64 *)(s + 16);
            __m64 v3 = *(__m64 *)(s + 24);
            __m64 v4 = *(__m64 *)(s + 32);
            __m64 v5 = *(__m64 *)(s + 40);
            __m64 v6 = *(__m64 *)(s + 48);
            __m64 v7 = *(__m64 *)(s + 56);
            *(__m64 *)(d + 0)  = v0;
            *(__m64 *)(d + 8)  = v1;
            *(__m64 *)(d + 16) = v2;
            *(__m64 *)(d + 24) = v3;
            *(__m64 *)(d + 32) = v4;
            *(__m64 *)(d + 40) = v5;
            *(__m64 *)(d + 48) = v6;
            *(__m64 *)(d + 56) = v7;
#endif

            w -= 64;
            s += 64;
            d += 64;
        }
        while (w >= 4)
        {
            *(uint32_t *)d = *(uint32_t *)s;

            w -= 4;
            s += 4;
            d += 4;
        }
        if (w >= 2)
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }
    }

    _mm_empty ();

    return TRUE;
}
static void
mmx_composite_copy_area (pixman_implementation_t *imp, pixman_op_t op,
                         pixman_image_t *src_image,
                         pixman_image_t *mask_image,
                         pixman_image_t *dst_image,
                         int32_t src_x, int32_t src_y,
                         int32_t mask_x, int32_t mask_y,
                         int32_t dest_x, int32_t dest_y,
                         int32_t width, int32_t height)
{
    pixman_blt_mmx (src_image->bits.bits,
                    dst_image->bits.bits,
                    src_image->bits.rowstride,
                    dst_image->bits.rowstride,
                    PIXMAN_FORMAT_BPP (src_image->bits.format),
                    PIXMAN_FORMAT_BPP (dst_image->bits.format),
                    src_x, src_y, dest_x, dest_y, width, height);
}
static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, pixman_op_t op,
                                pixman_image_t *src_image,
                                pixman_image_t *mask_image,
                                pixman_image_t *dst_image,
                                int32_t src_x, int32_t src_y,
                                int32_t mask_x, int32_t mask_y,
                                int32_t dest_x, int32_t dest_y,
                                int32_t width, int32_t height)
{
    uint32_t *src, *src_line;
    uint32_t *dst, *dst_line;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w--)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 s = load8888 (*src | 0xff000000);

                if (m == 0xff)
                {
                    *dst = store8888 (s);
                }
                else
                {
                    __m64 sa = expand_alpha (s);
                    __m64 vm = expand_alpha_rev (M64 (m));
                    __m64 vdest = in_over (s, sa, vm, load8888 (*dst));

                    *dst = store8888 (vdest);
                }
            }

            mask++;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),

    /* FIXME: This code is commented out since it's apparently
     * not actually faster than the generic code.
     */
#if 0
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
#endif

    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),

    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),

    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8000_8000 ),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),

    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),

    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),

    { PIXMAN_OP_NONE },
};
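/* Illustrative note: at composite time pixman walks this table in
 * order, matching the operator and the source/mask/destination formats
 * of the request; the first matching entry's function is called, and
 * anything not listed here falls through to the delegate
 * implementation passed to _pixman_implementation_create () below. */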
static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t *               src_bits,
         uint32_t *               dst_bits,
         int                      src_stride,
         int                      dst_stride,
         int                      src_bpp,
         int                      dst_bpp,
         int                      src_x,
         int                      src_y,
         int                      dst_x,
         int                      dst_y,
         int                      width,
         int                      height)
{
    if (!pixman_blt_mmx (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height))
    {
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height);
    }

    return TRUE;
}
static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t                 xor)
{
    if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}
pixman_implementation_t *
_pixman_implementation_create_mmx (void)
{
    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
    pixman_implementation_t *imp = _pixman_implementation_create (general, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    return imp;
}
#endif /* USE_MMX */