/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#include "pixman-private.h"
#include "pixman-combine32.h"

#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
/* Notes about writing mmx code
 *
 * Give memory operands as the second operand.  If you give it as the
 * first, gcc will first load it into a register and then use that
 * register; i.e. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 * rather than
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies; i.e., when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */
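/*
 * For instance (an illustrative sketch, not code from this file): when
 * two pixels are processed together, computing both alpha expansions
 * before either multiply gives the CPU independent work to overlap:
 *
 *         a0 = expand_alpha (s0);
 *         a1 = expand_alpha (s1);
 *         r0 = pix_multiply (d0, a0);
 *         r1 = pix_multiply (d1, a1);
 *
 * rather than pairing each expand_alpha with the multiply that
 * immediately consumes it.
 */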
/* --------------- MMX primitives ------------------------------------- */

typedef uint64_t mmxdatafield;
typedef __m64 mmxdatafield;
/* If __m64 is defined as a struct or union, define M64_MEMBER to be the
   name of the member used to access the data */
# define M64_MEMBER m64_u64
# elif defined(__SUNPRO_C)
# define M64_MEMBER l_

typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_ffff0000ffff0000;
    mmxdatafield mmx_0000ffff00000000;
    mmxdatafield mmx_000000000000ffff;
} mmx_data_t;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field = { val ## ULL }
#else                           /* __m64 is an integral type */
# define MMXDATA_INIT(field, val) field = val ## ULL
#endif
static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_ffff0000ffff0000,         0xffff0000ffff0000),
    MMXDATA_INIT (.mmx_0000ffff00000000,         0x0000ffff00000000),
    MMXDATA_INIT (.mmx_000000000000ffff,         0x000000000000ffff),
};
# define MC(x) to_m64 (c.mmx_ ## x)
# define MC(x) ((__m64)c.mmx_ ## x)
# define MC(x) c.mmx_ ## x

static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef __ICC
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#else                           /* __m64 is an integral type */
    return (__m64)x;
#endif
}

static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef __ICC
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#else                           /* __m64 is an integral type */
    return (uint64_t)x;
#endif
}

static force_inline __m64
shift (__m64 v, int s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}

static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}
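/*
 * pix_multiply below computes an approximation of (a * b) / 255 in each
 * 16-bit lane: t = a * b + 0x80, then (t + (t >> 8)) >> 8.  For 8-bit
 * inputs this is exact division by 255 with rounding; e.g. (a sanity
 * check, not from this file) a = b = 0xff gives t = 0xfe81, and
 * (0xfe81 + 0xfe) >> 8 = 0xff.
 */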
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
    res = _mm_srli_pi16 (res, 8);

    return res;
}

static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}
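/*
 * expand_alpha broadcasts the 16-bit alpha lane of an expanded pixel
 * (00AA00RR00GG00BB) into all four lanes (00AA00AA00AA00AA);
 * expand_alpha_rev does the same for an alpha sitting in the low lane.
 */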
static force_inline __m64
expand_alpha (__m64 pixel)
{
    __m64 t1, t2;

    t1 = shift (pixel, -48);
    t2 = shift (t1, 16);
    t1 = _mm_or_si64 (t1, t2);
    t2 = shift (t1, 32);
    t1 = _mm_or_si64 (t1, t2);

    return t1;
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    __m64 t1, t2;

    /* move alpha to low 16 bits and zero the rest */
    t1 = shift (pixel, 48);
    t1 = shift (t1, -48);

    t2 = shift (t1, 16);
    t1 = _mm_or_si64 (t1, t2);
    t2 = shift (t1, 32);
    t1 = _mm_or_si64 (t1, t2);

    return t1;
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    __m64 x, y, z;

    x = y = z = pixel;

    x = _mm_and_si64 (x, MC (ffff0000ffff0000));
    y = _mm_and_si64 (y, MC (000000000000ffff));
    z = _mm_and_si64 (z, MC (0000ffff00000000));

    y = shift (y, 32);
    z = shift (z, -32);

    x = _mm_or_si64 (x, y);
    x = _mm_or_si64 (x, z);

    return x;
}
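/*
 * over() is the Porter-Duff OVER operator on expanded, premultiplied
 * pixels: result = src + dest * (1 - srca), where srca is the already
 * expanded alpha of src (see expand_alpha above).
 */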
static force_inline __m64
over (__m64 src, __m64 srca, __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}

static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}

static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}

static force_inline __m64
in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)
{
    src = _mm_or_si64 (src, MC (full_alpha));

    return over (in (src, mask), mask, dest);
}

#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}
#else
#define in_over(src, srca, mask, dest) \
    over (in (src, mask), pix_multiply (srca, mask), dest)
#endif

static force_inline __m64
load8888 (uint32_t v)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ());
}

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static force_inline uint32_t
store8888 (__m64 v)
{
    return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 * 00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
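/*
 * Worked example of the multiplier trick (a sanity check, not from this
 * file): each 16-bit lane of mmx_565_unpack_multiplier is the sum of
 * two powers of two, so the multiply lays down two shifted copies of
 * the field side by side.  For blue, b * 0x0840 = b << 11 | b << 6, and
 * after the final >> 8 the lane holds b << 3 | b >> 2, the usual
 * replicate-the-high-bits expansion from 5 (or 6) bits to 8.
 */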
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
    p = shift (shift (p, (3 - pos) * 16), -48);

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));

    return _mm_srli_pi16 (pixel, 8);
}

static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}

static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}
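/*
 * pack_565 is the inverse of expand565: it converts one expanded pixel
 * back to r5g6b5 and inserts it into 16-bit lane @pos of @target, while
 * the mmx_mask_0..3 constants preserve the other three lanes.
 */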
static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0 + 3) + pos * 16);

    if (pos == 0)
        t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
        t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
        t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
        t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
}

#ifndef _MSC_VER
static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}
#else
#define pix_add_mul(x, a, y, b)  \
    ( x = pix_multiply (x, a),   \
      y = pix_multiply (y, b),   \
      pix_add (x, y) )
#endif
/* --------------- MMX code paths for fbcompose.c --------------------- */
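/*
 * All the *_u ("unified") combiners below share one convention:
 * combine() first folds the optional mask into the source by
 * multiplying the source with the mask's expanded alpha, so each
 * operator afterwards only sees a source and a destination pixel.
 * The *_ca ("component alpha") variants further down keep the full
 * four-channel mask instead of just its alpha.
 */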
static force_inline uint32_t
combine (const uint32_t *src, const uint32_t *mask)
    uint32_t ssrc = *src;

        __m64 m = load8888 (*mask);
        __m64 s = load8888 (ssrc);

        m = expand_alpha (m);
        s = pix_multiply (s, m);

        ssrc = store8888 (s);

mmx_combine_over_u (pixman_implementation_t *imp,
                    const uint32_t *         src,
                    const uint32_t *         mask,
    const uint32_t *end = dest + width;

        uint32_t ssrc = combine (src, mask);
        uint32_t a = ssrc >> 24;

            sa = expand_alpha (s);
            *dest = store8888 (over (s, sa, load8888 (*dest)));

mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            const uint32_t *         src,
                            const uint32_t *         mask,
    const uint32_t *end = dest + width;

        uint32_t s = combine (src, mask);

        d = load8888 (*dest);
        da = expand_alpha (d);
        *dest = store8888 (over (d, da, load8888 (s)));

mmx_combine_in_u (pixman_implementation_t *imp,
                  const uint32_t *         src,
                  const uint32_t *         mask,
    const uint32_t *end = dest + width;

        x = load8888 (combine (src, mask));
        a = load8888 (*dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);

        *dest = store8888 (x);

mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          const uint32_t *         src,
                          const uint32_t *         mask,
    const uint32_t *end = dest + width;

        x = load8888 (*dest);
        a = load8888 (combine (src, mask));
        a = expand_alpha (a);
        x = pix_multiply (x, a);
        *dest = store8888 (x);

mmx_combine_out_u (pixman_implementation_t *imp,
                   const uint32_t *         src,
                   const uint32_t *         mask,
    const uint32_t *end = dest + width;

        x = load8888 (combine (src, mask));
        a = load8888 (*dest);
        a = expand_alpha (a);

        x = pix_multiply (x, a);
        *dest = store8888 (x);

mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           const uint32_t *         src,
                           const uint32_t *         mask,
    const uint32_t *end = dest + width;

        x = load8888 (*dest);
        a = load8888 (combine (src, mask));
        a = expand_alpha (a);

        x = pix_multiply (x, a);

        *dest = store8888 (x);

mmx_combine_atop_u (pixman_implementation_t *imp,
                    const uint32_t *         src,
                    const uint32_t *         mask,
    const uint32_t *end = dest + width;

        s = load8888 (combine (src, mask));
        d = load8888 (*dest);
        sia = expand_alpha (s);

        da = expand_alpha (d);
        s = pix_add_mul (s, da, d, sia);
        *dest = store8888 (s);

mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            const uint32_t *         src,
                            const uint32_t *         mask,

        s = load8888 (combine (src, mask));
        d = load8888 (*dest);
        sa = expand_alpha (s);
        dia = expand_alpha (d);

        s = pix_add_mul (s, dia, d, sa);
        *dest = store8888 (s);

mmx_combine_xor_u (pixman_implementation_t *imp,
                   const uint32_t *         src,
                   const uint32_t *         mask,
    const uint32_t *end = dest + width;

        __m64 s, dia, d, sia;

        s = load8888 (combine (src, mask));
        d = load8888 (*dest);
        sia = expand_alpha (s);
        dia = expand_alpha (d);

        s = pix_add_mul (s, dia, d, sia);
        *dest = store8888 (s);

mmx_combine_add_u (pixman_implementation_t *imp,
                   const uint32_t *         src,
                   const uint32_t *         mask,
    const uint32_t *end = dest + width;

        s = load8888 (combine (src, mask));
        d = load8888 (*dest);

        *dest = store8888 (s);

mmx_combine_saturate_u (pixman_implementation_t *imp,
                        const uint32_t *         src,
                        const uint32_t *         mask,
    const uint32_t *end = dest + width;

        uint32_t s = combine (src, mask);

        __m64 ms = load8888 (s);
        __m64 md = load8888 (d);
        uint32_t sa = s >> 24;
        uint32_t da = ~d >> 24;

            __m64 msa = load8888 (DIV_UN8 (da, sa) << 24);
            msa = expand_alpha (msa);
            ms = pix_multiply (ms, msa);

        md = pix_add (md, ms);
        *dest = store8888 (md);

mmx_combine_src_ca (pixman_implementation_t *imp,
                    const uint32_t *         src,
                    const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);

        s = pix_multiply (s, a);
        *dest = store8888 (s);

mmx_combine_over_ca (pixman_implementation_t *imp,
                     const uint32_t *         src,
                     const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 sa = expand_alpha (s);

        *dest = store8888 (in_over (s, sa, a, d));

mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             const uint32_t *         src,
                             const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);

        *dest = store8888 (over (d, da, in (s, a)));

mmx_combine_in_ca (pixman_implementation_t *imp,
                   const uint32_t *         src,
                   const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        *dest = store8888 (s);

mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           const uint32_t *         src,
                           const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        d = pix_multiply (d, a);
        *dest = store8888 (d);

mmx_combine_out_ca (pixman_implementation_t *imp,
                    const uint32_t *         src,
                    const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        *dest = store8888 (s);

mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            const uint32_t *         src,
                            const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);

        d = pix_multiply (d, a);
        *dest = store8888 (d);

mmx_combine_atop_ca (pixman_implementation_t *imp,
                     const uint32_t *         src,
                     const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);

        d = pix_add_mul (d, a, s, da);
        *dest = store8888 (d);

mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             const uint32_t *         src,
                             const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);

        d = pix_add_mul (d, a, s, da);
        *dest = store8888 (d);

mmx_combine_xor_ca (pixman_implementation_t *imp,
                    const uint32_t *         src,
                    const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);

        d = pix_add_mul (d, a, s, da);
        *dest = store8888 (d);

mmx_combine_add_ca (pixman_implementation_t *imp,
                    const uint32_t *         src,
                    const uint32_t *         mask,
    const uint32_t *end = src + width;

        __m64 a = load8888 (*mask);
        __m64 s = load8888 (*src);
        __m64 d = load8888 (*dest);

        s = pix_multiply (s, a);

        *dest = store8888 (d);

/* ------------- MMX code paths called from fbpict.c -------------------- */
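/*
 * The composite functions below all share one loop shape: copy pixels
 * one at a time until dst is 8-byte aligned, process two (or more)
 * pixels per __m64 in the middle, then finish the remainder singly.
 * Condensed (an illustrative sketch, not code from this file):
 *
 *     while (w && (unsigned long)dst & 7) { <one pixel>;  w--;    }
 *     while (w >= 2)                      { <two pixels>; w -= 2; }
 *     while (w)                           { <one pixel>;  w--;    }
 */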
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

        dst_line += dst_stride;

        while (w && (unsigned long)dst & 7)
            *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));

            vdest = *(__m64 *)dst;

            dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
            dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

            *(__m64 *)dst = pack8888 (dest0, dest1);

            *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));

mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

        dst_line += dst_stride;

        while (w && (unsigned long)dst & 7)
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            vdest = *(__m64 *)dst;

            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
            vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);

            *(__m64 *)dst = vdest;

            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *mask_line;
    int dst_stride, mask_stride;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

        uint32_t *p = (uint32_t *)mask_line;
        uint32_t *q = (uint32_t *)dst_line;

        while (twidth && (unsigned long)q & 7)
            uint32_t m = *(uint32_t *)p;

                __m64 vdest = load8888 (*q);
                vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
                *q = store8888 (vdest);

            __m64 vdest = *(__m64 *)q;

            dest0 = in_over (vsrc, vsrca, load8888 (m0),
                             expand8888 (vdest, 0));
            dest1 = in_over (vsrc, vsrca, load8888 (m1),
                             expand8888 (vdest, 1));

            *(__m64 *)q = pack8888 (dest0, dest1);

            uint32_t m = *(uint32_t *)p;

                __m64 vdest = load8888 (*q);
                vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
                *q = store8888 (vdest);

        dst_line += dst_stride;
        mask_line += mask_stride;

mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
    vmask = load8888 (mask);

        dst_line += dst_stride;

        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));

            __m64 vs = *(__m64 *)src;
            __m64 vd = *(__m64 *)dst;
            __m64 vsrc0 = expand8888 (vs, 0);
            __m64 vsrc1 = expand8888 (vs, 1);

            *(__m64 *)dst = pack8888 (
                in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
                in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));

mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
    vmask = load8888 (mask);

        dst_line += dst_stride;

        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            __m64 s = load8888 (*src | 0xff000000);
            __m64 d = load8888 (*dst);

            *dst = store8888 (in_over (s, srca, vmask, d));

            __m64 vd0 = *(__m64 *)(dst + 0);
            __m64 vd1 = *(__m64 *)(dst + 2);
            __m64 vd2 = *(__m64 *)(dst + 4);
            __m64 vd3 = *(__m64 *)(dst + 6);
            __m64 vd4 = *(__m64 *)(dst + 8);
            __m64 vd5 = *(__m64 *)(dst + 10);
            __m64 vd6 = *(__m64 *)(dst + 12);
            __m64 vd7 = *(__m64 *)(dst + 14);

            __m64 vs0 = *(__m64 *)(src + 0);
            __m64 vs1 = *(__m64 *)(src + 2);
            __m64 vs2 = *(__m64 *)(src + 4);
            __m64 vs3 = *(__m64 *)(src + 6);
            __m64 vs4 = *(__m64 *)(src + 8);
            __m64 vs5 = *(__m64 *)(src + 10);
            __m64 vs6 = *(__m64 *)(src + 12);
            __m64 vs7 = *(__m64 *)(src + 14);

                in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
                in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

                in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
                in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

                in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
                in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

                in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
                in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

                in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
                in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

                in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
                in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

                in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
                in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

                in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
                in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

            *(__m64 *)(dst + 0) = vd0;
            *(__m64 *)(dst + 2) = vd1;
            *(__m64 *)(dst + 4) = vd2;
            *(__m64 *)(dst + 6) = vd3;
            *(__m64 *)(dst + 8) = vd4;
            *(__m64 *)(dst + 10) = vd5;
            *(__m64 *)(dst + 12) = vd6;
            *(__m64 *)(dst + 14) = vd7;

            __m64 s = load8888 (*src | 0xff000000);
            __m64 d = load8888 (*dst);

            *dst = store8888 (in_over (s, srca, vmask, d));

mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        dst_line += dst_stride;

        src_line += src_stride;

            sa = expand_alpha (ms);
            *dst = store8888 (over (ms, sa, load8888 (*dst)));

mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    assert (src_image->drawable == mask_image->drawable);

        dst_line += dst_stride;

        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            __m64 vsrc = load8888 (*src);

            __m64 vdest = expand565 (to_m64 (d), 0);

                over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            __m64 vsrc0, vsrc1, vsrc2, vsrc3;

            vsrc0 = load8888 (*(src + 0));
            vsrc1 = load8888 (*(src + 1));
            vsrc2 = load8888 (*(src + 2));
            vsrc3 = load8888 (*(src + 3));

            vdest = *(__m64 *)dst;

            vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
            vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
            vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
            vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);

            *(__m64 *)dst = vdest;

            __m64 vsrc = load8888 (*src);

            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

        dst_line += dst_stride;

        mask_line += mask_stride;

        while (w && (unsigned long)dst & 7)
                __m64 vdest = in_over (vsrc, vsrca,
                                       expand_alpha_rev (to_m64 (m)),

                *dst = store8888 (vdest);

            if (srca == 0xff && (m0 & m1) == 0xff)
                *(uint64_t *)dst = srcsrc;

                vdest = *(__m64 *)dst;

                dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
                                 expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (dest0, dest1);

                __m64 vdest = load8888 (*dst);

                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
                *dst = store8888 (vdest);
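/*
 * pixman_fill_mmx fills a width x height rectangle of an 8-, 16- or
 * 32-bpp buffer with the value in xor (replicated to the full 64-bit
 * pattern), storing 64 bytes of vfill per iteration once the
 * destination is aligned.
 */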
pixman_fill_mmx (uint32_t *bits,
    uint32_t byte_width;

    __m64 v1, v2, v3, v4, v5, v6, v7;

    if (bpp != 16 && bpp != 32 && bpp != 8)

        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);

        xor = (xor & 0xff) * 0x01010101;

        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;

        xor = (xor & 0xffff) * 0x00010001;

        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;

    fill = ((uint64_t)xor << 32) | xor;
    vfill = to_m64 (fill);

        : "=&y" (v1), "=&y" (v2), "=&y" (v3),
          "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)

        uint8_t *d = byte_line;

        byte_line += stride;

        while (w >= 1 && ((unsigned long)d & 1))
            *(uint8_t *)d = (xor & 0xff);

        while (w >= 2 && ((unsigned long)d & 3))
            *(uint16_t *)d = xor;

        while (w >= 4 && ((unsigned long)d & 7))
            *(uint32_t *)d = xor;

                : "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
                  "y" (v4), "y" (v5), "y" (v6), "y" (v7)

            *(__m64*) (d + 0)  = vfill;
            *(__m64*) (d + 8)  = vfill;
            *(__m64*) (d + 16) = vfill;
            *(__m64*) (d + 24) = vfill;
            *(__m64*) (d + 32) = vfill;
            *(__m64*) (d + 40) = vfill;
            *(__m64*) (d + 48) = vfill;
            *(__m64*) (d + 56) = vfill;

            *(uint32_t *)d = xor;

            *(uint16_t *)d = xor;

            *(uint8_t *)d = (xor & 0xff);
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

        pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride,
                         PIXMAN_FORMAT_BPP (dest_image->bits.format),
                         dest_x, dest_y, width, height, 0);

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);

        dst_line += dst_stride;

        mask_line += mask_stride;

        while (w && (unsigned long)dst & 7)
                __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

                *dst = store8888 (vdest);

            if (srca == 0xff && (m0 & m1) == 0xff)
                *(uint64_t *)dst = srcsrc;

                dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
                dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));

                *(__m64 *)dst = pack8888 (dest0, dest1);

                *(uint64_t *)dst = 0;

                __m64 vdest = load8888 (*dst);

                vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
                *dst = store8888 (vdest);

mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca, tmp;
    uint64_t srcsrcsrcsrc, src16;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    src16 = to_uint64 (tmp);

        (uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
        (uint64_t)src16 << 16 | (uint64_t)src16;

        dst_line += dst_stride;

        mask_line += mask_stride;

        while (w && (unsigned long)dst & 7)
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);

            uint64_t m0, m1, m2, m3;

            if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
                *(uint64_t *)dst = srcsrcsrcsrc;
            else if (m0 | m1 | m2 | m3)
                __m64 vm0, vm1, vm2, vm3;

                vdest = *(__m64 *)dst;

                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
                                           expand565 (vdest, 0)), vdest, 0);

                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
                                           expand565 (vdest, 1)), vdest, 1);

                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
                                           expand565 (vdest, 2)), vdest, 2);

                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
                                           expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)dst = vdest;

                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);

mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    assert (src_image->drawable == mask_image->drawable);

        dst_line += dst_stride;

        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            __m64 vsrc = load8888 (*src);

            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            uint32_t s0, s1, s2, s3;
            unsigned char a0, a1, a2, a3;

            if ((a0 & a1 & a2 & a3) == 0xFF)
                vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0);
                vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1);
                vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2);
                vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3);

                *(__m64 *)dst = vdest;
            else if (s0 | s1 | s2 | s3)
                __m64 vdest = *(__m64 *)dst;

                vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0);
                vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1);
                vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2);
                vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)dst = vdest;

            __m64 vsrc = load8888 (*src);

            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    assert (src_image->drawable == mask_image->drawable);

        dst_line += dst_stride;

        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = store8888 (over_rev_non_pre (s, d));

            unsigned char a0, a1;

            if ((a0 & a1) == 0xFF)
                d0 = invert_colors (load8888 (s0));
                d1 = invert_colors (load8888 (s1));

                *(__m64 *)dst = pack8888 (d0, d1);

                __m64 vdest = *(__m64 *)dst;

                d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0));
                d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (d0, d1);

            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = store8888 (over_rev_non_pre (s, d));

mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *mask_line;
    int dst_stride, mask_stride;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

        uint32_t *p = (uint32_t *)mask_line;
        uint16_t *q = (uint16_t *)dst_line;

        while (twidth && ((unsigned long)q & 7))
            uint32_t m = *(uint32_t *)p;

                __m64 vdest = expand565 (to_m64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
                *q = to_uint64 (vdest);

            uint32_t m0, m1, m2, m3;

            if ((m0 | m1 | m2 | m3))
                __m64 vdest = *(__m64 *)q;

                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)q = vdest;

                __m64 vdest = expand565 (to_m64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
                *q = to_uint64 (vdest);

        mask_line += mask_stride;
        dst_line += dst_stride;

mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

        dst_line += dst_stride;

        mask_line += mask_stride;

    if ((((unsigned long)dest_image & 3) == 0) &&
        (((unsigned long)src_image & 3) == 0))

            vmask = load8888 (*(uint32_t *)mask);
            vdest = load8888 (*(uint32_t *)dst);

            *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));

            m = MUL_UN8 (sa, a, tmp);
            d = MUL_UN8 (m, d, tmp);

mmx_composite_in_8_8 (pixman_implementation_t *imp,
                      pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int src_stride, dst_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

        dst_line += dst_stride;

        src_line += src_stride;

    if ((((unsigned long)dest_image & 3) == 0) &&
        (((unsigned long)src_image & 3) == 0))

            uint32_t *s = (uint32_t *)src;
            uint32_t *d = (uint32_t *)dst;

            *d = store8888 (in (load8888 (*s), load8888 (*d)));

            *dst = MUL_UN8 (s, d, tmp);

mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

        dst_line += dst_stride;

        mask_line += mask_stride;

    if ((((unsigned long)mask_image & 3) == 0) &&
        (((unsigned long)dest_image & 3) == 0))

            __m64 vmask = load8888 (*(uint32_t *)mask);
            __m64 vdest = load8888 (*(uint32_t *)dst);

            *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));

            m = MUL_UN8 (sa, a, tmp);
            r = ADD_UN8 (m, d, tmp);

mmx_composite_add_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

        dst_line += dst_stride;

        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            s = t | (0 - (t >> 8));

            *(__m64*)dst = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);

            s = t | (0 - (t >> 8));

mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

        dst_line += dst_stride;

        src_line += src_stride;

        while (w && (unsigned long)dst & 7)
            *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
                                                   _mm_cvtsi32_si64 (*dst)));

            dst64 = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);
            *(uint64_t*)dst = to_uint64 (dst64);

            *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
                                                   _mm_cvtsi32_si64 (*dst)));
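/*
 * pixman_blt_mmx copies a rectangle between two buffers of equal bpp
 * (16 or 32 only), aligning the destination first and then moving
 * 64 bytes per iteration through the eight MMX registers.
 */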
static pixman_bool_t
pixman_blt_mmx (uint32_t *src_bits,
    uint8_t *  src_bytes;
    uint8_t *  dst_bytes;

    if (src_bpp != dst_bpp)

        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 2 * width;

    else if (src_bpp == 32)
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 4 * width;

        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;

        while (w >= 2 && ((unsigned long)d & 3))
            *(uint16_t *)d = *(uint16_t *)s;

        while (w >= 4 && ((unsigned long)d & 7))
            *(uint32_t *)d = *(uint32_t *)s;

#if defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
                "movq (%1), %%mm0\n"
                "movq 8(%1), %%mm1\n"
                "movq 16(%1), %%mm2\n"
                "movq 24(%1), %%mm3\n"
                "movq 32(%1), %%mm4\n"
                "movq 40(%1), %%mm5\n"
                "movq 48(%1), %%mm6\n"
                "movq 56(%1), %%mm7\n"

                "movq %%mm0, (%0)\n"
                "movq %%mm1, 8(%0)\n"
                "movq %%mm2, 16(%0)\n"
                "movq %%mm3, 24(%0)\n"
                "movq %%mm4, 32(%0)\n"
                "movq %%mm5, 40(%0)\n"
                "movq %%mm6, 48(%0)\n"
                "movq %%mm7, 56(%0)\n"

                "%mm0", "%mm1", "%mm2", "%mm3",
                "%mm4", "%mm5", "%mm6", "%mm7");

            __m64 v0 = *(__m64 *)(s + 0);
            __m64 v1 = *(__m64 *)(s + 8);
            __m64 v2 = *(__m64 *)(s + 16);
            __m64 v3 = *(__m64 *)(s + 24);
            __m64 v4 = *(__m64 *)(s + 32);
            __m64 v5 = *(__m64 *)(s + 40);
            __m64 v6 = *(__m64 *)(s + 48);
            __m64 v7 = *(__m64 *)(s + 56);
            *(__m64 *)(d + 0)  = v0;
            *(__m64 *)(d + 8)  = v1;
            *(__m64 *)(d + 16) = v2;
            *(__m64 *)(d + 24) = v3;
            *(__m64 *)(d + 32) = v4;
            *(__m64 *)(d + 40) = v5;
            *(__m64 *)(d + 48) = v6;
            *(__m64 *)(d + 56) = v7;

            *(uint32_t *)d = *(uint32_t *)s;

            *(uint16_t *)d = *(uint16_t *)s;

mmx_composite_copy_area (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);

    pixman_blt_mmx (src_image->bits.bits,
                    dest_image->bits.bits,
                    src_image->bits.rowstride,
                    dest_image->bits.rowstride,
                    PIXMAN_FORMAT_BPP (src_image->bits.format),
                    PIXMAN_FORMAT_BPP (dest_image->bits.format),
                    src_x, src_y, dest_x, dest_y, width, height);

mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *src, *src_line;
    uint32_t *dst, *dst_line;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        src_line += src_stride;

        dst_line += dst_stride;

        mask_line += mask_stride;

            __m64 s = load8888 (*src | 0xff000000);

                *dst = store8888 (s);

                __m64 sa = expand_alpha (s);
                __m64 vm = expand_alpha_rev (to_m64 (m));
                __m64 vdest = in_over (s, sa, vm, load8888 (*dst));

                *dst = store8888 (vdest);
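/*
 * The table below maps an (operator, source, mask, destination) format
 * tuple to one of the specialized routines above; pixman consults these
 * fast paths before falling back to the general implementation that
 * this one delegates to.
 */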
static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),

    /* FIXME: This code is commented out since it's apparently
     * not actually faster than the generic code.
     */
#if 0
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
#endif

    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),

    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),

    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),

    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),

    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
};

static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t *               src_bits,
         uint32_t *               dst_bits,
    if (!pixman_blt_mmx (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dest_x, dest_y, width, height))
        return _pixman_implementation_blt (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dest_x, dest_y, width, height);

static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
    if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->fill = mmx_fill;
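/*
 * Usage sketch (illustrative; the actual call site lives in pixman's
 * CPU-detection code, not in this file): implementations are chained so
 * that operations this backend does not handle fall through to the
 * fallback:
 *
 *     if (pixman_have_mmx ())
 *         imp = _pixman_implementation_create_mmx (imp);
 */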
#endif /* USE_MMX */