2 * Copyright © 2004, 2005 Red Hat, Inc.
3 * Copyright © 2004 Nicholas Miell
4 * Copyright © 2005 Trolltech AS
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of Red Hat not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission. Red Hat makes no representations about the
13 * suitability of this software for any purpose. It is provided "as is"
14 * without express or implied warranty.
16 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
25 * Author: Søren Sandmann (sandmann@redhat.com)
26 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
29 * Based on work by Owen Taylor
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
38 #ifdef USE_LOONGSON_MMI
39 #include <loongson-mmintrin.h>
43 #include "pixman-private.h"
44 #include "pixman-combine32.h"
49 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
55 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
56 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
64 # if (defined(__SUNPRO_C) || defined(_MSC_VER))
65 # include <xmmintrin.h>
67 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
68 * instructions to be generated that we don't want. Just duplicate the
69 * functions we want to use. */
70 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
71 _mm_movemask_pi8 (__m64 __A)
75 asm ("pmovmskb %1, %0\n\t"
83 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
84 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
86 asm ("pmulhuw %1, %0\n\t"
94 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
99 asm ("pshufw %2, %1, %0\n\t"
101 : "y" (__A), "K" (__N)
107 # define _mm_shuffle_pi16(A, N) \
108 ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))
114 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
115 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
118 /* Notes about writing mmx code
120 * give memory operands as the second operand. If you give it as the
121 * first, gcc will first load it into a register, then use that
126 * _mm_mullo_pi16 (x, mmx_constant);
130 * _mm_mullo_pi16 (mmx_constant, x);
132 * Also try to minimize dependencies. i.e. when you need a value, try
133 * to calculate it from a value that was calculated as early as
137 /* --------------- MMX primitives ------------------------------------- */
139 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
140 * the name of the member used to access the data.
141 * If __m64 requires using mm_cvt* intrinsics functions to convert between
142 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
143 * If __m64 and uint64_t values can just be cast to each other directly,
144 * then define USE_M64_CASTS.
145 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
148 # define M64_MEMBER m64_u64
150 # define USE_CVT_INTRINSICS
151 #elif defined(USE_LOONGSON_MMI)
152 # define USE_M64_DOUBLE
153 #elif defined(__GNUC__)
154 # define USE_M64_CASTS
155 #elif defined(__SUNPRO_C)
156 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
157 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
158 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
159 * is defined. If it is used, then the mm_cvt* intrinsics must be used.
161 # define USE_CVT_INTRINSICS
163 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
164 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
166 # define M64_MEMBER l_
170 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
171 typedef uint64_t mmxdatafield;
173 typedef __m64 mmxdatafield;
178 mmxdatafield mmx_4x00ff;
179 mmxdatafield mmx_4x0080;
180 mmxdatafield mmx_565_rgb;
181 mmxdatafield mmx_565_unpack_multiplier;
182 mmxdatafield mmx_565_pack_multiplier;
183 mmxdatafield mmx_565_r;
184 mmxdatafield mmx_565_g;
185 mmxdatafield mmx_565_b;
186 mmxdatafield mmx_packed_565_rb;
187 mmxdatafield mmx_packed_565_g;
188 mmxdatafield mmx_expand_565_g;
189 mmxdatafield mmx_expand_565_b;
190 mmxdatafield mmx_expand_565_r;
191 #ifndef USE_LOONGSON_MMI
192 mmxdatafield mmx_mask_0;
193 mmxdatafield mmx_mask_1;
194 mmxdatafield mmx_mask_2;
195 mmxdatafield mmx_mask_3;
197 mmxdatafield mmx_full_alpha;
198 mmxdatafield mmx_4x0101;
199 mmxdatafield mmx_ff000000;
202 #if defined(_MSC_VER)
203 # define MMXDATA_INIT(field, val) { val ## UI64 }
204 #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */
205 # define MMXDATA_INIT(field, val) field = { val ## ULL }
206 #else /* mmxdatafield is an integral type */
207 # define MMXDATA_INIT(field, val) field = val ## ULL
210 static const mmx_data_t c =
212 MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
213 MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
214 MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
215 MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
216 MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004),
217 MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
218 MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
219 MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
220 MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
221 MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
222 MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0),
223 MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f),
224 MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800),
225 #ifndef USE_LOONGSON_MMI
226 MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
227 MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
228 MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
229 MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
231 MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
232 MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
233 MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000),
236 #ifdef USE_CVT_INTRINSICS
237 # define MC(x) to_m64 (c.mmx_ ## x)
238 #elif defined(USE_M64_CASTS)
239 # define MC(x) ((__m64)c.mmx_ ## x)
240 #elif defined(USE_M64_DOUBLE)
241 # define MC(x) (*(__m64 *)&c.mmx_ ## x)
243 # define MC(x) c.mmx_ ## x
246 static force_inline __m64
249 #ifdef USE_CVT_INTRINSICS
250 return _mm_cvtsi64_m64 (x);
251 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
256 #elif defined USE_M64_DOUBLE
258 #else /* USE_M64_CASTS */
263 static force_inline uint64_t
266 #ifdef USE_CVT_INTRINSICS
267 return _mm_cvtm64_si64 (x);
268 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
269 uint64_t res = x.M64_MEMBER;
271 #elif defined USE_M64_DOUBLE
272 return *(uint64_t *)&x;
273 #else /* USE_M64_CASTS */
278 static force_inline __m64
283 return _mm_slli_si64 (v, s);
285 return _mm_srli_si64 (v, -s);
290 static force_inline __m64
293 return _mm_xor_si64 (mask, MC (4x00ff));
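/* Per-channel multiply of two expanded pixels (4x16, 8-bit values).
 * Computes approximately (a * b) / 255 per component using the usual
 * rounded divide-by-255 trick: t = a*b + 0x80; result = (t * 0x0101) >> 16.
 * e.g. 0xff * 0x80 -> 0x80.
 */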
296 static force_inline __m64
297 pix_multiply (__m64 a, __m64 b)
301 res = _mm_mullo_pi16 (a, b);
302 res = _mm_adds_pu16 (res, MC (4x0080));
303 res = _mm_mulhi_pu16 (res, MC (4x0101));
308 static force_inline __m64
309 pix_add (__m64 a, __m64 b)
311 return _mm_adds_pu8 (a, b);
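/* The shuffles below rearrange the four 16-bit channels of an expanded
 * pixel (words B, G, R, A from low to high):
 *   expand_alpha     - broadcast the alpha word into all four channels
 *   expand_alpha_rev - broadcast the low word (used for an 8-bit mask
 *                      loaded into the low word)
 *   invert_colors    - swap R and B, leaving G and A in place
 */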
314 static force_inline __m64
315 expand_alpha (__m64 pixel)
317 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
320 static force_inline __m64
321 expand_alpha_rev (__m64 pixel)
323 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
326 static force_inline __m64
327 invert_colors (__m64 pixel)
329 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
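/* Porter-Duff OVER for premultiplied pixels:
 *   dest' = src + dest * (255 - srca)
 * srca must already hold the source alpha replicated into every channel
 * (see expand_alpha above).
 */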
332 static force_inline __m64
337 return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
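/* OVER with a non-premultiplied, reversed-channel source: force the alpha
 * word of the multiplier to 0xff, swap R/B into place, premultiply the
 * color channels by the real source alpha, then do a normal OVER.
 */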
340 static force_inline __m64
341 over_rev_non_pre (__m64 src, __m64 dest)
343 __m64 srca = expand_alpha (src);
344 __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
346 return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
349 static force_inline __m64
350 in (__m64 src, __m64 mask)
352 return pix_multiply (src, mask);
356 static force_inline __m64
357 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
359 return over (in (src, mask), pix_multiply (srca, mask), dest);
364 #define in_over(src, srca, mask, dest) \
365 over (in (src, mask), pix_multiply (srca, mask), dest)
369 /* Elemental unaligned loads */
371 static force_inline __m64 ldq_u(__m64 *p)
374 /* x86's alignment restrictions are very relaxed. */
376 #elif defined USE_ARM_IWMMXT
377 int align = (uintptr_t)p & 7;
381 aligned_p = (__m64 *)((uintptr_t)p & ~7);
382 return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
384 struct __una_u64 { __m64 x __attribute__((packed)); };
385 const struct __una_u64 *ptr = (const struct __una_u64 *) p;
386 return (__m64) ptr->x;
390 static force_inline uint32_t ldl_u(const uint32_t *p)
393 /* x86's alignment restrictions are very relaxed. */
396 struct __una_u32 { uint32_t x __attribute__((packed)); };
397 const struct __una_u32 *ptr = (const struct __una_u32 *) p;
402 static force_inline __m64
403 load (const uint32_t *v)
405 #ifdef USE_LOONGSON_MMI
407 asm ("lwc1 %0, %1\n\t"
413 return _mm_cvtsi32_si64 (*v);
417 static force_inline __m64
418 load8888 (const uint32_t *v)
420 #ifdef USE_LOONGSON_MMI
421 return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
423 return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
427 static force_inline __m64
428 load8888u (const uint32_t *v)
430 uint32_t l = ldl_u (v);
431 return load8888 (&l);
434 static force_inline __m64
435 pack8888 (__m64 lo, __m64 hi)
437 return _mm_packs_pu16 (lo, hi);
440 static force_inline void
441 store (uint32_t *dest, __m64 v)
443 #ifdef USE_LOONGSON_MMI
444 asm ("swc1 %1, %0\n\t"
450 *dest = _mm_cvtsi64_si32 (v);
454 static force_inline void
455 store8888 (uint32_t *dest, __m64 v)
457 v = pack8888 (v, _mm_setzero_si64 ());
461 static force_inline pixman_bool_t
462 is_equal (__m64 a, __m64 b)
464 #ifdef USE_LOONGSON_MMI
465 /* __m64 is double, we can compare directly. */
468 return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
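/* is_opaque: v is an expanded pixel (4x16, values in the low byte of each
 * word), so byte 6 holds the alpha; the pixel is opaque iff that byte is
 * 0xff, which on x86 is read out of bit 6 of the pmovmskb result.
 */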
472 static force_inline pixman_bool_t
475 #ifdef USE_LOONGSON_MMI
476 return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
478 __m64 ffs = _mm_cmpeq_pi8 (v, v);
479 return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
483 static force_inline pixman_bool_t
486 return is_equal (v, _mm_setzero_si64 ());
489 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
493 * --- Expanding 565 in the low word ---
495 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
496 * m = m & (01f0003f001f);
497 * m = m * (008404100840);
500 * Note the trick here - the top word is shifted by another nibble to
501 * avoid it bumping into the middle word
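 *
 * After the mask and multiply, each 5- or 6-bit channel has been widened
 * to 8 bits with its high bits replicated into the low bits (so 0x1f
 * becomes 0xff), sitting in the high byte of its word; the final >> 8
 * moves it down into 00RR00GG00BB form.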
503 static force_inline __m64
504 expand565 (__m64 pixel, int pos)
509 /* move pixel to low 16 bit and zero the rest */
510 #ifdef USE_LOONGSON_MMI
511 p = loongson_extract_pi16 (p, pos);
513 p = shift (shift (p, (3 - pos) * 16), -48);
516 t1 = shift (p, 36 - 11);
517 t2 = shift (p, 16 - 5);
519 p = _mm_or_si64 (t1, p);
520 p = _mm_or_si64 (t2, p);
521 p = _mm_and_si64 (p, MC (565_rgb));
523 pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
524 return _mm_srli_pi16 (pixel, 8);
527 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
531 static force_inline void
532 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
534 __m64 t0, t1, alpha = _mm_setzero_si64 ();
535 __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
536 __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
537 __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
539 alpha = _mm_cmpeq_pi32 (alpha, alpha);
541 /* Replicate high bits into empty low bits. */
542 r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
543 g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
544 b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
546 r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
547 g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
548 b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */
550 t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */
551 t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */
553 *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */
554 *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */
557 static force_inline __m64
558 expand8888 (__m64 in, int pos)
561 return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
563 return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
566 static force_inline __m64
567 expandx888 (__m64 in, int pos)
569 return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
572 static force_inline void
573 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
576 expand_4xpacked565 (vin, &v0, &v1, full_alpha);
577 *vout0 = expand8888 (v0, 0);
578 *vout1 = expand8888 (v0, 1);
579 *vout2 = expand8888 (v1, 0);
580 *vout3 = expand8888 (v1, 1);
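/* pack_565: convert the single expanded pixel in @pixel back to r5g6b5 and
 * insert it into halfword @pos (0-3) of @target, leaving the other
 * halfwords of @target intact.
 */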
583 static force_inline __m64
584 pack_565 (__m64 pixel, __m64 target, int pos)
590 r = _mm_and_si64 (p, MC (565_r));
591 g = _mm_and_si64 (p, MC (565_g));
592 b = _mm_and_si64 (p, MC (565_b));
594 #ifdef USE_LOONGSON_MMI
595 r = shift (r, -(32 - 8));
596 g = shift (g, -(16 - 3));
597 b = shift (b, -(0 + 3));
599 p = _mm_or_si64 (r, g);
600 p = _mm_or_si64 (p, b);
601 return loongson_insert_pi16 (t, p, pos);
603 r = shift (r, -(32 - 8) + pos * 16);
604 g = shift (g, -(16 - 3) + pos * 16);
605 b = shift (b, -(0 + 3) + pos * 16);
608 t = _mm_and_si64 (t, MC (mask_0));
610 t = _mm_and_si64 (t, MC (mask_1));
612 t = _mm_and_si64 (t, MC (mask_2));
614 t = _mm_and_si64 (t, MC (mask_3));
616 p = _mm_or_si64 (r, t);
617 p = _mm_or_si64 (g, p);
619 return _mm_or_si64 (b, p);
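/* pack_4xpacked565: pack the four a8r8g8b8 pixels in @a (pixels 0, 1) and
 * @b (pixels 2, 3) down to four r5g6b5 pixels in one register.  The
 * pmaddwd with 565_pack_multiplier moves the red and blue fields of each
 * pixel into their 565 positions in a single multiply-add; green is just
 * masked and OR'd in.
 */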
623 static force_inline __m64
624 pack_4xpacked565 (__m64 a, __m64 b)
626 __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
627 __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
629 __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
630 __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
632 __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
633 __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
635 t0 = _mm_or_si64 (t0, g0);
636 t1 = _mm_or_si64 (t1, g1);
639 #ifdef USE_ARM_IWMMXT
641 return _mm_packs_pu32 (t0, t1);
643 t1 = shift (t1, -5 + 16);
644 return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
650 static force_inline __m64
651 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
653 return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
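/* pix_add_mul: computes x * a + y * b per channel (with unsigned
 * saturation on the add); this is the core of the ATOP and XOR combiners
 * below.
 */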
656 static force_inline __m64
657 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
659 x = pix_multiply (x, a);
660 y = pix_multiply (y, b);
662 return pix_add (x, y);
667 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
669 #define pack_4x565(v0, v1, v2, v3) \
670 pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
672 #define pix_add_mul(x, a, y, b) \
673 ( x = pix_multiply (x, a), \
674 y = pix_multiply (y, b), \
679 /* --------------- MMX code patch for fbcompose.c --------------------- */
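/* combine: load one source pixel and, when a mask is present, multiply it
 * by the mask's alpha - the common source setup for the unified (_u)
 * combiners.  A typical combiner body then looks like:
 *
 *     __m64 vsrc = combine (src, mask);
 *     store8888 (dest, over (vsrc, expand_alpha (vsrc), load8888 (dest)));
 */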
681 static force_inline __m64
682 combine (const uint32_t *src, const uint32_t *mask)
684 __m64 vsrc = load8888 (src);
688 __m64 m = load8888 (mask);
690 m = expand_alpha (m);
691 vsrc = pix_multiply (vsrc, m);
698 mmx_combine_over_u (pixman_implementation_t *imp,
701 const uint32_t * src,
702 const uint32_t * mask,
705 const uint32_t *end = dest + width;
709 __m64 vsrc = combine (src, mask);
711 if (is_opaque (vsrc))
713 store8888 (dest, vsrc);
715 else if (!is_zero (vsrc))
717 __m64 sa = expand_alpha (vsrc);
718 store8888 (dest, over (vsrc, sa, load8888 (dest)));
730 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
733 const uint32_t * src,
734 const uint32_t * mask,
737 const uint32_t *end = dest + width;
742 __m64 s = combine (src, mask);
745 da = expand_alpha (d);
746 store8888 (dest, over (d, da, s));
757 mmx_combine_in_u (pixman_implementation_t *imp,
760 const uint32_t * src,
761 const uint32_t * mask,
764 const uint32_t *end = dest + width;
769 __m64 x = combine (src, mask);
772 a = expand_alpha (a);
773 x = pix_multiply (x, a);
786 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
789 const uint32_t * src,
790 const uint32_t * mask,
793 const uint32_t *end = dest + width;
797 __m64 a = combine (src, mask);
801 a = expand_alpha (a);
802 x = pix_multiply (x, a);
814 mmx_combine_out_u (pixman_implementation_t *imp,
817 const uint32_t * src,
818 const uint32_t * mask,
821 const uint32_t *end = dest + width;
826 __m64 x = combine (src, mask);
829 a = expand_alpha (a);
831 x = pix_multiply (x, a);
843 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
846 const uint32_t * src,
847 const uint32_t * mask,
850 const uint32_t *end = dest + width;
854 __m64 a = combine (src, mask);
858 a = expand_alpha (a);
860 x = pix_multiply (x, a);
873 mmx_combine_atop_u (pixman_implementation_t *imp,
876 const uint32_t * src,
877 const uint32_t * mask,
880 const uint32_t *end = dest + width;
885 __m64 s = combine (src, mask);
888 sia = expand_alpha (s);
890 da = expand_alpha (d);
891 s = pix_add_mul (s, da, d, sia);
903 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
906 const uint32_t * src,
907 const uint32_t * mask,
917 __m64 s = combine (src, mask);
920 sa = expand_alpha (s);
921 dia = expand_alpha (d);
923 s = pix_add_mul (s, dia, d, sa);
935 mmx_combine_xor_u (pixman_implementation_t *imp,
938 const uint32_t * src,
939 const uint32_t * mask,
942 const uint32_t *end = dest + width;
947 __m64 s = combine (src, mask);
950 sia = expand_alpha (s);
951 dia = expand_alpha (d);
954 s = pix_add_mul (s, dia, d, sia);
966 mmx_combine_add_u (pixman_implementation_t *imp,
969 const uint32_t * src,
970 const uint32_t * mask,
973 const uint32_t *end = dest + width;
978 __m64 s = combine (src, mask);
993 mmx_combine_saturate_u (pixman_implementation_t *imp,
996 const uint32_t * src,
997 const uint32_t * mask,
1000 const uint32_t *end = dest + width;
1006 __m64 ms = combine (src, mask);
1007 __m64 md = load8888 (dest);
1015 uint32_t quot = DIV_UN8 (da, sa) << 24;
1016 __m64 msa = load8888 (&quot);
1017 msa = expand_alpha (msa);
1018 ms = pix_multiply (ms, msa);
1021 md = pix_add (md, ms);
1022 store8888 (dest, md);
1033 mmx_combine_src_ca (pixman_implementation_t *imp,
1036 const uint32_t * src,
1037 const uint32_t * mask,
1040 const uint32_t *end = src + width;
1044 __m64 a = load8888 (mask);
1045 __m64 s = load8888 (src);
1047 s = pix_multiply (s, a);
1048 store8888 (dest, s);
1058 mmx_combine_over_ca (pixman_implementation_t *imp,
1061 const uint32_t * src,
1062 const uint32_t * mask,
1065 const uint32_t *end = src + width;
1069 __m64 a = load8888 (mask);
1070 __m64 s = load8888 (src);
1071 __m64 d = load8888 (dest);
1072 __m64 sa = expand_alpha (s);
1074 store8888 (dest, in_over (s, sa, a, d));
1084 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1087 const uint32_t * src,
1088 const uint32_t * mask,
1091 const uint32_t *end = src + width;
1095 __m64 a = load8888 (mask);
1096 __m64 s = load8888 (src);
1097 __m64 d = load8888 (dest);
1098 __m64 da = expand_alpha (d);
1100 store8888 (dest, over (d, da, in (s, a)));
1110 mmx_combine_in_ca (pixman_implementation_t *imp,
1113 const uint32_t * src,
1114 const uint32_t * mask,
1117 const uint32_t *end = src + width;
1121 __m64 a = load8888 (mask);
1122 __m64 s = load8888 (src);
1123 __m64 d = load8888 (dest);
1124 __m64 da = expand_alpha (d);
1126 s = pix_multiply (s, a);
1127 s = pix_multiply (s, da);
1128 store8888 (dest, s);
1138 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1141 const uint32_t * src,
1142 const uint32_t * mask,
1145 const uint32_t *end = src + width;
1149 __m64 a = load8888 (mask);
1150 __m64 s = load8888 (src);
1151 __m64 d = load8888 (dest);
1152 __m64 sa = expand_alpha (s);
1154 a = pix_multiply (a, sa);
1155 d = pix_multiply (d, a);
1156 store8888 (dest, d);
1166 mmx_combine_out_ca (pixman_implementation_t *imp,
1169 const uint32_t * src,
1170 const uint32_t * mask,
1173 const uint32_t *end = src + width;
1177 __m64 a = load8888 (mask);
1178 __m64 s = load8888 (src);
1179 __m64 d = load8888 (dest);
1180 __m64 da = expand_alpha (d);
1183 s = pix_multiply (s, a);
1184 s = pix_multiply (s, da);
1185 store8888 (dest, s);
1195 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1198 const uint32_t * src,
1199 const uint32_t * mask,
1202 const uint32_t *end = src + width;
1206 __m64 a = load8888 (mask);
1207 __m64 s = load8888 (src);
1208 __m64 d = load8888 (dest);
1209 __m64 sa = expand_alpha (s);
1211 a = pix_multiply (a, sa);
1213 d = pix_multiply (d, a);
1214 store8888 (dest, d);
1224 mmx_combine_atop_ca (pixman_implementation_t *imp,
1227 const uint32_t * src,
1228 const uint32_t * mask,
1231 const uint32_t *end = src + width;
1235 __m64 a = load8888 (mask);
1236 __m64 s = load8888 (src);
1237 __m64 d = load8888 (dest);
1238 __m64 da = expand_alpha (d);
1239 __m64 sa = expand_alpha (s);
1241 s = pix_multiply (s, a);
1242 a = pix_multiply (a, sa);
1244 d = pix_add_mul (d, a, s, da);
1245 store8888 (dest, d);
1255 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1258 const uint32_t * src,
1259 const uint32_t * mask,
1262 const uint32_t *end = src + width;
1266 __m64 a = load8888 (mask);
1267 __m64 s = load8888 (src);
1268 __m64 d = load8888 (dest);
1269 __m64 da = expand_alpha (d);
1270 __m64 sa = expand_alpha (s);
1272 s = pix_multiply (s, a);
1273 a = pix_multiply (a, sa);
1275 d = pix_add_mul (d, a, s, da);
1276 store8888 (dest, d);
1286 mmx_combine_xor_ca (pixman_implementation_t *imp,
1289 const uint32_t * src,
1290 const uint32_t * mask,
1293 const uint32_t *end = src + width;
1297 __m64 a = load8888 (mask);
1298 __m64 s = load8888 (src);
1299 __m64 d = load8888 (dest);
1300 __m64 da = expand_alpha (d);
1301 __m64 sa = expand_alpha (s);
1303 s = pix_multiply (s, a);
1304 a = pix_multiply (a, sa);
1307 d = pix_add_mul (d, a, s, da);
1308 store8888 (dest, d);
1318 mmx_combine_add_ca (pixman_implementation_t *imp,
1321 const uint32_t * src,
1322 const uint32_t * mask,
1325 const uint32_t *end = src + width;
1329 __m64 a = load8888 (mask);
1330 __m64 s = load8888 (src);
1331 __m64 d = load8888 (dest);
1333 s = pix_multiply (s, a);
1335 store8888 (dest, d);
1344 /* ------------- MMX code paths called from fbpict.c -------------------- */
1347 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1348 pixman_composite_info_t *info)
1350 PIXMAN_COMPOSITE_ARGS (info);
1352 uint32_t *dst_line, *dst;
1359 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1364 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1366 vsrc = load8888 (&src);
1367 vsrca = expand_alpha (vsrc);
1372 dst_line += dst_stride;
1377 while (w && (unsigned long)dst & 7)
1379 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1390 vdest = *(__m64 *)dst;
1392 dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1393 dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1395 *(__m64 *)dst = pack8888 (dest0, dest1);
1405 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1413 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1414 pixman_composite_info_t *info)
1416 PIXMAN_COMPOSITE_ARGS (info);
1418 uint16_t *dst_line, *dst;
1425 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1430 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1432 vsrc = load8888 (&src);
1433 vsrca = expand_alpha (vsrc);
1438 dst_line += dst_stride;
1443 while (w && (unsigned long)dst & 7)
1446 __m64 vdest = expand565 (to_m64 (d), 0);
1448 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1449 *dst = to_uint64 (vdest);
1457 __m64 vdest = *(__m64 *)dst;
1458 __m64 v0, v1, v2, v3;
1460 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1462 v0 = over (vsrc, vsrca, v0);
1463 v1 = over (vsrc, vsrca, v1);
1464 v2 = over (vsrc, vsrca, v2);
1465 v3 = over (vsrc, vsrca, v3);
1467 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1478 __m64 vdest = expand565 (to_m64 (d), 0);
1480 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1481 *dst = to_uint64 (vdest);
1492 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1493 pixman_composite_info_t *info)
1495 PIXMAN_COMPOSITE_ARGS (info);
1498 uint32_t *mask_line;
1499 int dst_stride, mask_stride;
1504 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1509 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1510 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1512 vsrc = load8888 (&src);
1513 vsrca = expand_alpha (vsrc);
1518 uint32_t *p = (uint32_t *)mask_line;
1519 uint32_t *q = (uint32_t *)dst_line;
1521 while (twidth && (unsigned long)q & 7)
1523 uint32_t m = *(uint32_t *)p;
1527 __m64 vdest = load8888 (q);
1528 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1529 store8888 (q, vdest);
1546 __m64 vdest = *(__m64 *)q;
1548 dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1549 expand8888 (vdest, 0));
1550 dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1551 expand8888 (vdest, 1));
1553 *(__m64 *)q = pack8888 (dest0, dest1);
1563 uint32_t m = *(uint32_t *)p;
1567 __m64 vdest = load8888 (q);
1568 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1569 store8888 (q, vdest);
1577 dst_line += dst_stride;
1578 mask_line += mask_stride;
1585 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1586 pixman_composite_info_t *info)
1588 PIXMAN_COMPOSITE_ARGS (info);
1589 uint32_t *dst_line, *dst;
1590 uint32_t *src_line, *src;
1593 int dst_stride, src_stride;
1598 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1599 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1601 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1603 mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1604 vmask = load8888 (&mask);
1609 dst_line += dst_stride;
1611 src_line += src_stride;
1614 while (w && (unsigned long)dst & 7)
1616 __m64 s = load8888 (src);
1617 __m64 d = load8888 (dst);
1619 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1628 __m64 vs = ldq_u ((__m64 *)src);
1629 __m64 vd = *(__m64 *)dst;
1630 __m64 vsrc0 = expand8888 (vs, 0);
1631 __m64 vsrc1 = expand8888 (vs, 1);
1633 *(__m64 *)dst = pack8888 (
1634 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1635 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1644 __m64 s = load8888 (src);
1645 __m64 d = load8888 (dst);
1647 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1655 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1656 pixman_composite_info_t *info)
1658 PIXMAN_COMPOSITE_ARGS (info);
1659 uint32_t *dst_line, *dst;
1660 uint32_t *src_line, *src;
1663 int dst_stride, src_stride;
1669 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1670 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1671 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1674 mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1675 vmask = load8888 (&mask);
1681 dst_line += dst_stride;
1683 src_line += src_stride;
1686 while (w && (unsigned long)dst & 7)
1688 uint32_t ssrc = *src | 0xff000000;
1689 __m64 s = load8888 (&ssrc);
1690 __m64 d = load8888 (dst);
1692 store8888 (dst, in_over (s, srca, vmask, d));
1701 __m64 vd0 = *(__m64 *)(dst + 0);
1702 __m64 vd1 = *(__m64 *)(dst + 2);
1703 __m64 vd2 = *(__m64 *)(dst + 4);
1704 __m64 vd3 = *(__m64 *)(dst + 6);
1705 __m64 vd4 = *(__m64 *)(dst + 8);
1706 __m64 vd5 = *(__m64 *)(dst + 10);
1707 __m64 vd6 = *(__m64 *)(dst + 12);
1708 __m64 vd7 = *(__m64 *)(dst + 14);
1710 __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1711 __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1712 __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1713 __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1714 __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1715 __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1716 __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1717 __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1720 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1721 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1724 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1725 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1728 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1729 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1732 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1733 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1736 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1737 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1740 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1741 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1744 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1745 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1748 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1749 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1751 *(__m64 *)(dst + 0) = vd0;
1752 *(__m64 *)(dst + 2) = vd1;
1753 *(__m64 *)(dst + 4) = vd2;
1754 *(__m64 *)(dst + 6) = vd3;
1755 *(__m64 *)(dst + 8) = vd4;
1756 *(__m64 *)(dst + 10) = vd5;
1757 *(__m64 *)(dst + 12) = vd6;
1758 *(__m64 *)(dst + 14) = vd7;
1767 uint32_t ssrc = *src | 0xff000000;
1768 __m64 s = load8888 (&ssrc);
1769 __m64 d = load8888 (dst);
1771 store8888 (dst, in_over (s, srca, vmask, d));
1783 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1784 pixman_composite_info_t *info)
1786 PIXMAN_COMPOSITE_ARGS (info);
1787 uint32_t *dst_line, *dst;
1788 uint32_t *src_line, *src;
1790 int dst_stride, src_stride;
1796 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1797 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1802 dst_line += dst_stride;
1804 src_line += src_stride;
1820 sa = expand_alpha (ms);
1821 store8888 (dst, over (ms, sa, load8888 (dst)));
1831 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1832 pixman_composite_info_t *info)
1834 PIXMAN_COMPOSITE_ARGS (info);
1835 uint16_t *dst_line, *dst;
1836 uint32_t *src_line, *src;
1837 int dst_stride, src_stride;
1842 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1843 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1847 assert (src_image->drawable == mask_image->drawable);
1853 dst_line += dst_stride;
1855 src_line += src_stride;
1860 while (w && (unsigned long)dst & 7)
1862 __m64 vsrc = load8888 (src);
1864 __m64 vdest = expand565 (to_m64 (d), 0);
1867 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1869 *dst = to_uint64 (vdest);
1880 __m64 vdest = *(__m64 *)dst;
1881 __m64 v0, v1, v2, v3;
1883 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1885 __m64 vsrc0 = load8888 ((src + 0));
1886 __m64 vsrc1 = load8888 ((src + 1));
1887 __m64 vsrc2 = load8888 ((src + 2));
1888 __m64 vsrc3 = load8888 ((src + 3));
1890 v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1891 v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1892 v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1893 v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1895 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1906 __m64 vsrc = load8888 (src);
1908 __m64 vdest = expand565 (to_m64 (d), 0);
1910 vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1912 *dst = to_uint64 (vdest);
1924 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1925 pixman_composite_info_t *info)
1927 PIXMAN_COMPOSITE_ARGS (info);
1929 uint32_t *dst_line, *dst;
1930 uint8_t *mask_line, *mask;
1931 int dst_stride, mask_stride;
1938 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1944 srcsrc = (uint64_t)src << 32 | src;
1946 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1947 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1949 vsrc = load8888 (&src);
1950 vsrca = expand_alpha (vsrc);
1955 dst_line += dst_stride;
1957 mask_line += mask_stride;
1962 while (w && (unsigned long)dst & 7)
1968 __m64 vdest = in_over (vsrc, vsrca,
1969 expand_alpha_rev (to_m64 (m)),
1972 store8888 (dst, vdest);
1989 if (srca == 0xff && (m0 & m1) == 0xff)
1991 *(uint64_t *)dst = srcsrc;
1998 vdest = *(__m64 *)dst;
2000 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2001 expand8888 (vdest, 0));
2002 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2003 expand8888 (vdest, 1));
2005 *(__m64 *)dst = pack8888 (dest0, dest1);
2021 __m64 vdest = load8888 (dst);
2024 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2025 store8888 (dst, vdest);
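/* pixman_fill_mmx: solid fill for 8/16/32 bpp rectangles.  The fill value
 * is replicated to 32 and then 64 bits, the destination pointer is aligned
 * with narrow writes, and the bulk is written 64 bytes at a time - via an
 * inline-asm MMX loop on GCC/x86, plain __m64 stores otherwise.
 */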
2034 pixman_fill_mmx (uint32_t *bits,
2045 uint32_t byte_width;
2048 #if defined __GNUC__ && defined USE_X86_MMX
2049 __m64 v1, v2, v3, v4, v5, v6, v7;
2052 if (bpp != 16 && bpp != 32 && bpp != 8)
2057 stride = stride * (int) sizeof (uint32_t) / 1;
2058 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2061 xor = (xor & 0xff) * 0x01010101;
2065 stride = stride * (int) sizeof (uint32_t) / 2;
2066 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2067 byte_width = 2 * width;
2069 xor = (xor & 0xffff) * 0x00010001;
2073 stride = stride * (int) sizeof (uint32_t) / 4;
2074 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2075 byte_width = 4 * width;
2079 fill = ((uint64_t)xor << 32) | xor;
2080 vfill = to_m64 (fill);
2082 #if defined __GNUC__ && defined USE_X86_MMX
2091 : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2092 "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2099 uint8_t *d = byte_line;
2101 byte_line += stride;
2104 if (w >= 1 && ((unsigned long)d & 1))
2106 *(uint8_t *)d = (xor & 0xff);
2111 if (w >= 2 && ((unsigned long)d & 3))
2113 *(uint16_t *)d = xor;
2118 while (w >= 4 && ((unsigned long)d & 7))
2120 *(uint32_t *)d = xor;
2128 #if defined __GNUC__ && defined USE_X86_MMX
2140 "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2141 "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2144 *(__m64*) (d + 0) = vfill;
2145 *(__m64*) (d + 8) = vfill;
2146 *(__m64*) (d + 16) = vfill;
2147 *(__m64*) (d + 24) = vfill;
2148 *(__m64*) (d + 32) = vfill;
2149 *(__m64*) (d + 40) = vfill;
2150 *(__m64*) (d + 48) = vfill;
2151 *(__m64*) (d + 56) = vfill;
2159 *(uint32_t *)d = xor;
2166 *(uint16_t *)d = xor;
2172 *(uint8_t *)d = (xor & 0xff);
2184 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2185 pixman_composite_info_t *info)
2187 PIXMAN_COMPOSITE_ARGS (info);
2188 uint16_t *dst_line, *dst;
2189 uint32_t *src_line, *src, s;
2190 int dst_stride, src_stride;
2193 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2194 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2199 dst_line += dst_stride;
2201 src_line += src_stride;
2204 while (w && (unsigned long)dst & 7)
2207 *dst = CONVERT_8888_TO_0565 (s);
2215 __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2216 __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2218 vdest = pack_4xpacked565 (vsrc0, vsrc1);
2220 *(__m64 *)dst = vdest;
2230 *dst = CONVERT_8888_TO_0565 (s);
2238 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2239 pixman_composite_info_t *info)
2241 PIXMAN_COMPOSITE_ARGS (info);
2243 uint32_t *dst_line, *dst;
2244 uint8_t *mask_line, *mask;
2245 int dst_stride, mask_stride;
2252 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2257 pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride,
2258 PIXMAN_FORMAT_BPP (dest_image->bits.format),
2259 dest_x, dest_y, width, height, 0);
2263 srcsrc = (uint64_t)src << 32 | src;
2265 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2266 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2268 vsrc = load8888 (&src);
2273 dst_line += dst_stride;
2275 mask_line += mask_stride;
2280 while (w && (unsigned long)dst & 7)
2286 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2288 store8888 (dst, vdest);
2308 if (srca == 0xff && (m0 & m1) == 0xff)
2310 *(uint64_t *)dst = srcsrc;
2316 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2317 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2319 *(__m64 *)dst = pack8888 (dest0, dest1);
2323 *(uint64_t *)dst = 0;
2339 __m64 vdest = load8888 (dst);
2341 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2342 store8888 (dst, vdest);
2355 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2356 pixman_composite_info_t *info)
2358 PIXMAN_COMPOSITE_ARGS (info);
2360 uint16_t *dst_line, *dst;
2361 uint8_t *mask_line, *mask;
2362 int dst_stride, mask_stride;
2364 __m64 vsrc, vsrca, tmp;
2369 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2375 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2376 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2378 vsrc = load8888 (&src);
2379 vsrca = expand_alpha (vsrc);
2381 tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2382 srcsrcsrcsrc = expand_alpha_rev (tmp);
2387 dst_line += dst_stride;
2389 mask_line += mask_stride;
2394 while (w && (unsigned long)dst & 7)
2401 __m64 vd = to_m64 (d);
2402 __m64 vdest = in_over (
2403 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2405 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2406 *dst = to_uint64 (vd);
2418 uint64_t m0, m1, m2, m3;
2424 if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2426 *(__m64 *)dst = srcsrcsrcsrc;
2428 else if (m0 | m1 | m2 | m3)
2430 __m64 vdest = *(__m64 *)dst;
2431 __m64 v0, v1, v2, v3;
2433 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2435 __m64 vm0 = to_m64 (m0);
2436 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2438 __m64 vm1 = to_m64 (m1);
2439 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2441 __m64 vm2 = to_m64 (m2);
2442 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2444 __m64 vm3 = to_m64 (m3);
2445 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2447 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2464 __m64 vd = to_m64 (d);
2465 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2467 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2468 *dst = to_uint64 (vd);
2481 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2482 pixman_composite_info_t *info)
2484 PIXMAN_COMPOSITE_ARGS (info);
2485 uint16_t *dst_line, *dst;
2486 uint32_t *src_line, *src;
2487 int dst_stride, src_stride;
2492 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2493 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2497 assert (src_image->drawable == mask_image->drawable);
2503 dst_line += dst_stride;
2505 src_line += src_stride;
2510 while (w && (unsigned long)dst & 7)
2512 __m64 vsrc = load8888 (src);
2514 __m64 vdest = expand565 (to_m64 (d), 0);
2516 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2518 *dst = to_uint64 (vdest);
2529 uint32_t s0, s1, s2, s3;
2530 unsigned char a0, a1, a2, a3;
2542 if ((a0 & a1 & a2 & a3) == 0xFF)
2544 __m64 v0 = invert_colors (load8888 (&s0));
2545 __m64 v1 = invert_colors (load8888 (&s1));
2546 __m64 v2 = invert_colors (load8888 (&s2));
2547 __m64 v3 = invert_colors (load8888 (&s3));
2549 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2551 else if (s0 | s1 | s2 | s3)
2553 __m64 vdest = *(__m64 *)dst;
2554 __m64 v0, v1, v2, v3;
2556 __m64 vsrc0 = load8888 (&s0);
2557 __m64 vsrc1 = load8888 (&s1);
2558 __m64 vsrc2 = load8888 (&s2);
2559 __m64 vsrc3 = load8888 (&s3);
2561 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2563 v0 = over_rev_non_pre (vsrc0, v0);
2564 v1 = over_rev_non_pre (vsrc1, v1);
2565 v2 = over_rev_non_pre (vsrc2, v2);
2566 v3 = over_rev_non_pre (vsrc3, v3);
2568 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2580 __m64 vsrc = load8888 (src);
2582 __m64 vdest = expand565 (to_m64 (d), 0);
2584 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2586 *dst = to_uint64 (vdest);
2598 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2599 pixman_composite_info_t *info)
2601 PIXMAN_COMPOSITE_ARGS (info);
2602 uint32_t *dst_line, *dst;
2603 uint32_t *src_line, *src;
2604 int dst_stride, src_stride;
2609 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2610 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2614 assert (src_image->drawable == mask_image->drawable);
2620 dst_line += dst_stride;
2622 src_line += src_stride;
2625 while (w && (unsigned long)dst & 7)
2627 __m64 s = load8888 (src);
2628 __m64 d = load8888 (dst);
2630 store8888 (dst, over_rev_non_pre (s, d));
2640 unsigned char a0, a1;
2649 if ((a0 & a1) == 0xFF)
2651 d0 = invert_colors (load8888 (&s0));
2652 d1 = invert_colors (load8888 (&s1));
2654 *(__m64 *)dst = pack8888 (d0, d1);
2658 __m64 vdest = *(__m64 *)dst;
2660 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2661 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2663 *(__m64 *)dst = pack8888 (d0, d1);
2673 __m64 s = load8888 (src);
2674 __m64 d = load8888 (dst);
2676 store8888 (dst, over_rev_non_pre (s, d));
2684 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2685 pixman_composite_info_t *info)
2687 PIXMAN_COMPOSITE_ARGS (info);
2690 uint32_t *mask_line;
2691 int dst_stride, mask_stride;
2696 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2701 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2702 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2704 vsrc = load8888 (&src);
2705 vsrca = expand_alpha (vsrc);
2710 uint32_t *p = (uint32_t *)mask_line;
2711 uint16_t *q = (uint16_t *)dst_line;
2713 while (twidth && ((unsigned long)q & 7))
2715 uint32_t m = *(uint32_t *)p;
2720 __m64 vdest = expand565 (to_m64 (d), 0);
2721 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2722 *q = to_uint64 (vdest);
2732 uint32_t m0, m1, m2, m3;
2739 if ((m0 | m1 | m2 | m3))
2741 __m64 vdest = *(__m64 *)q;
2742 __m64 v0, v1, v2, v3;
2744 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2746 v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2747 v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2748 v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2749 v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2751 *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2766 __m64 vdest = expand565 (to_m64 (d), 0);
2767 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2768 *q = to_uint64 (vdest);
2776 mask_line += mask_stride;
2777 dst_line += dst_stride;
2784 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2785 pixman_composite_info_t *info)
2787 PIXMAN_COMPOSITE_ARGS (info);
2788 uint8_t *dst_line, *dst;
2789 uint8_t *mask_line, *mask;
2790 int dst_stride, mask_stride;
2796 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2797 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2799 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2803 vsrc = load8888 (&src);
2804 vsrca = expand_alpha (vsrc);
2809 dst_line += dst_stride;
2811 mask_line += mask_stride;
2814 while (w && (unsigned long)dst & 7)
2823 m = MUL_UN8 (sa, a, tmp);
2824 d = MUL_UN8 (m, d, tmp);
2835 vmask = load8888u ((uint32_t *)mask);
2836 vdest = load8888 ((uint32_t *)dst);
2838 store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2854 m = MUL_UN8 (sa, a, tmp);
2855 d = MUL_UN8 (m, d, tmp);
2865 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2866 pixman_composite_info_t *info)
2868 PIXMAN_COMPOSITE_ARGS (info);
2869 uint8_t *dst_line, *dst;
2870 uint8_t *src_line, *src;
2871 int src_stride, dst_stride;
2874 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2875 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2880 dst_line += dst_stride;
2882 src_line += src_stride;
2885 while (w && (unsigned long)dst & 3)
2893 *dst = MUL_UN8 (s, d, tmp);
2902 uint32_t *s = (uint32_t *)src;
2903 uint32_t *d = (uint32_t *)dst;
2905 store8888 (d, in (load8888u (s), load8888 (d)));
2920 *dst = MUL_UN8 (s, d, tmp);
2931 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2932 pixman_composite_info_t *info)
2934 PIXMAN_COMPOSITE_ARGS (info);
2935 uint8_t *dst_line, *dst;
2936 uint8_t *mask_line, *mask;
2937 int dst_stride, mask_stride;
2943 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2944 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2946 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2953 vsrc = load8888 (&src);
2954 vsrca = expand_alpha (vsrc);
2959 dst_line += dst_stride;
2961 mask_line += mask_stride;
2964 while (w && (unsigned long)dst & 3)
2974 m = MUL_UN8 (sa, a, tmp);
2975 r = ADD_UN8 (m, d, tmp);
2986 vmask = load8888u ((uint32_t *)mask);
2987 vdest = load8888 ((uint32_t *)dst);
2989 store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3006 m = MUL_UN8 (sa, a, tmp);
3007 r = ADD_UN8 (m, d, tmp);
3017 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3018 pixman_composite_info_t *info)
3020 PIXMAN_COMPOSITE_ARGS (info);
3021 uint8_t *dst_line, *dst;
3022 uint8_t *src_line, *src;
3023 int dst_stride, src_stride;
3030 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3031 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3036 dst_line += dst_stride;
3038 src_line += src_stride;
3041 while (w && (unsigned long)dst & 7)
3046 s = t | (0 - (t >> 8));
3056 *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3067 s = t | (0 - (t >> 8));
3080 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3081 pixman_composite_info_t *info)
3083 PIXMAN_COMPOSITE_ARGS (info);
3084 uint16_t *dst_line, *dst;
3086 uint16_t *src_line, *src;
3088 int dst_stride, src_stride;
3093 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3094 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3099 dst_line += dst_stride;
3101 src_line += src_stride;
3104 while (w && (unsigned long)dst & 7)
3110 s = CONVERT_0565_TO_8888 (s);
3113 d = CONVERT_0565_TO_8888 (d);
3114 UN8x4_ADD_UN8x4 (s, d);
3116 *dst = CONVERT_8888_TO_0565 (s);
3124 __m64 vdest = *(__m64 *)dst;
3125 __m64 vsrc = ldq_u ((__m64 *)src);
3129 expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3130 expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3132 vd0 = _mm_adds_pu8 (vd0, vs0);
3133 vd1 = _mm_adds_pu8 (vd1, vs1);
3135 *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3148 s = CONVERT_0565_TO_8888 (s);
3151 d = CONVERT_0565_TO_8888 (d);
3152 UN8x4_ADD_UN8x4 (s, d);
3154 *dst = CONVERT_8888_TO_0565 (s);
3164 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3165 pixman_composite_info_t *info)
3167 PIXMAN_COMPOSITE_ARGS (info);
3168 uint32_t *dst_line, *dst;
3169 uint32_t *src_line, *src;
3170 int dst_stride, src_stride;
3175 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3176 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3181 dst_line += dst_stride;
3183 src_line += src_stride;
3186 while (w && (unsigned long)dst & 7)
3188 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3189 load ((const uint32_t *)dst)));
3197 *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3205 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3206 load ((const uint32_t *)dst)));
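/* pixman_blt_mmx: straight rectangle copy used by mmx_composite_copy_area.
 * It only handles 16- and 32-bpp blits with matching source and destination
 * depths, aligns the destination with narrow copies, then moves 64 bytes
 * per iteration (movq inline asm on GCC/x86, ldq_u + __m64 stores
 * otherwise).
 */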
3214 static pixman_bool_t
3215 pixman_blt_mmx (uint32_t *src_bits,
3228 uint8_t * src_bytes;
3229 uint8_t * dst_bytes;
3232 if (src_bpp != dst_bpp)
3237 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3238 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3239 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3240 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3241 byte_width = 2 * width;
3245 else if (src_bpp == 32)
3247 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3248 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3249 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3250 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3251 byte_width = 4 * width;
3263 uint8_t *s = src_bytes;
3264 uint8_t *d = dst_bytes;
3265 src_bytes += src_stride;
3266 dst_bytes += dst_stride;
3269 if (w >= 1 && ((unsigned long)d & 1))
3271 *(uint8_t *)d = *(uint8_t *)s;
3277 if (w >= 2 && ((unsigned long)d & 3))
3279 *(uint16_t *)d = *(uint16_t *)s;
3285 while (w >= 4 && ((unsigned long)d & 7))
3287 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3296 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3298 "movq (%1), %%mm0\n"
3299 "movq 8(%1), %%mm1\n"
3300 "movq 16(%1), %%mm2\n"
3301 "movq 24(%1), %%mm3\n"
3302 "movq 32(%1), %%mm4\n"
3303 "movq 40(%1), %%mm5\n"
3304 "movq 48(%1), %%mm6\n"
3305 "movq 56(%1), %%mm7\n"
3307 "movq %%mm0, (%0)\n"
3308 "movq %%mm1, 8(%0)\n"
3309 "movq %%mm2, 16(%0)\n"
3310 "movq %%mm3, 24(%0)\n"
3311 "movq %%mm4, 32(%0)\n"
3312 "movq %%mm5, 40(%0)\n"
3313 "movq %%mm6, 48(%0)\n"
3314 "movq %%mm7, 56(%0)\n"
3318 "%mm0", "%mm1", "%mm2", "%mm3",
3319 "%mm4", "%mm5", "%mm6", "%mm7");
3321 __m64 v0 = ldq_u ((__m64 *)(s + 0));
3322 __m64 v1 = ldq_u ((__m64 *)(s + 8));
3323 __m64 v2 = ldq_u ((__m64 *)(s + 16));
3324 __m64 v3 = ldq_u ((__m64 *)(s + 24));
3325 __m64 v4 = ldq_u ((__m64 *)(s + 32));
3326 __m64 v5 = ldq_u ((__m64 *)(s + 40));
3327 __m64 v6 = ldq_u ((__m64 *)(s + 48));
3328 __m64 v7 = ldq_u ((__m64 *)(s + 56));
3329 *(__m64 *)(d + 0) = v0;
3330 *(__m64 *)(d + 8) = v1;
3331 *(__m64 *)(d + 16) = v2;
3332 *(__m64 *)(d + 24) = v3;
3333 *(__m64 *)(d + 32) = v4;
3334 *(__m64 *)(d + 40) = v5;
3335 *(__m64 *)(d + 48) = v6;
3336 *(__m64 *)(d + 56) = v7;
3345 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3353 *(uint16_t *)d = *(uint16_t *)s;
3366 mmx_composite_copy_area (pixman_implementation_t *imp,
3367 pixman_composite_info_t *info)
3369 PIXMAN_COMPOSITE_ARGS (info);
3371 pixman_blt_mmx (src_image->bits.bits,
3372 dest_image->bits.bits,
3373 src_image->bits.rowstride,
3374 dest_image->bits.rowstride,
3375 PIXMAN_FORMAT_BPP (src_image->bits.format),
3376 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3377 src_x, src_y, dest_x, dest_y, width, height);
3381 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3382 pixman_composite_info_t *info)
3384 PIXMAN_COMPOSITE_ARGS (info);
3385 uint32_t *src, *src_line;
3386 uint32_t *dst, *dst_line;
3387 uint8_t *mask, *mask_line;
3388 int src_stride, mask_stride, dst_stride;
3391 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3392 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3393 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3398 src_line += src_stride;
3400 dst_line += dst_stride;
3402 mask_line += mask_stride;
3412 uint32_t ssrc = *src | 0xff000000;
3413 __m64 s = load8888 (&ssrc);
3421 __m64 sa = expand_alpha (s);
3422 __m64 vm = expand_alpha_rev (to_m64 (m));
3423 __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3425 store8888 (dst, vdest);
3439 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3441 int w = iter->width;
3442 uint32_t *dst = iter->buffer;
3443 uint32_t *src = (uint32_t *)iter->bits;
3445 iter->bits += iter->stride;
3447 while (w && ((unsigned long)dst) & 7)
3449 *dst++ = (*src++) | 0xff000000;
3455 __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3456 __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3457 __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3458 __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3460 *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3461 *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3462 *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3463 *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3472 *dst++ = (*src++) | 0xff000000;
3476 return iter->buffer;
3480 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3482 int w = iter->width;
3483 uint32_t *dst = iter->buffer;
3484 uint16_t *src = (uint16_t *)iter->bits;
3486 iter->bits += iter->stride;
3488 while (w && ((unsigned long)dst) & 0x0f)
3490 uint16_t s = *src++;
3492 *dst++ = CONVERT_0565_TO_8888 (s);
3498 __m64 vsrc = ldq_u ((__m64 *)src);
3501 expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3503 *(__m64 *)(dst + 0) = mm0;
3504 *(__m64 *)(dst + 2) = mm1;
3513 uint16_t s = *src++;
3515 *dst++ = CONVERT_0565_TO_8888 (s);
3519 return iter->buffer;
3523 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3525 int w = iter->width;
3526 uint32_t *dst = iter->buffer;
3527 uint8_t *src = iter->bits;
3529 iter->bits += iter->stride;
3531 while (w && (((unsigned long)dst) & 15))
3533 *dst++ = *(src++) << 24;
3539 __m64 mm0 = ldq_u ((__m64 *)src);
3541 __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0);
3542 __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0);
3543 __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3544 __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3545 __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3546 __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3548 *(__m64 *)(dst + 0) = mm3;
3549 *(__m64 *)(dst + 2) = mm4;
3550 *(__m64 *)(dst + 4) = mm5;
3551 *(__m64 *)(dst + 6) = mm6;
3560 *dst++ = *(src++) << 24;
3564 return iter->buffer;
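/* Scanline fetchers for the source iterator: each converts one line of
 * x8r8g8b8, r5g6b5 or a8 into a8r8g8b8 in iter->buffer.  mmx_src_iter_init
 * installs them only for untransformed bits images whose sample area lies
 * entirely inside the source; everything else is delegated.
 */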
3569 pixman_format_code_t format;
3570 pixman_iter_get_scanline_t get_scanline;
3573 static const fetcher_info_t fetchers[] =
3575 { PIXMAN_x8r8g8b8, mmx_fetch_x8r8g8b8 },
3576 { PIXMAN_r5g6b5, mmx_fetch_r5g6b5 },
3577 { PIXMAN_a8, mmx_fetch_a8 },
3582 mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
3584 pixman_image_t *image = iter->image;
3587 int width = iter->width;
3588 int height = iter->height;
3591 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)
3593 if ((iter->flags & ITER_NARROW) &&
3594 (image->common.flags & FLAGS) == FLAGS &&
3596 x + width <= image->bits.width &&
3597 y + height <= image->bits.height)
3599 const fetcher_info_t *f;
3601 for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
3603 if (image->common.extended_format_code == f->format)
3605 uint8_t *b = (uint8_t *)image->bits.bits;
3606 int s = image->bits.rowstride * 4;
3608 iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
3611 iter->get_scanline = f->get_scanline;
3617 imp->delegate->src_iter_init (imp->delegate, iter);
static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),

    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),

    PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ),
    PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),

    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),

    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),

    { PIXMAN_OP_NONE },
};
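/* mmx_blt () and mmx_fill () are thin wrappers: if pixman_blt_mmx () or
 * pixman_fill_mmx () cannot handle the request (for instance an unsupported
 * bits-per-pixel), the call is passed on to the delegate implementation.
 */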
static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t *               src_bits,
         uint32_t *               dst_bits,
         int                      src_stride,
         int                      dst_stride,
         int                      src_bpp,
         int                      dst_bpp,
         int                      src_x,
         int                      src_y,
         int                      dest_x,
         int                      dest_y,
         int                      width,
         int                      height)
{
    if (!pixman_blt_mmx (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dest_x, dest_y, width, height))
    {
        /* we can't do it; pass it on to the delegate */
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dest_x, dest_y, width, height);
    }

    return TRUE;
}
static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t                 xor)
{
    if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}
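/* Create the MMX implementation with `fallback' as its delegate: the fast
 * path table is registered here, and any operation the MMX code does not
 * provide (or rejects at runtime) falls back to the delegate.
 */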
pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->src_iter_init = mmx_src_iter_init;

    return imp;
}
#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */