2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38 #include "pixman-fast-path.h"
40 #if defined(_MSC_VER) && defined(_M_AMD64)
/* MSVC for x64 doesn't provide the MMX intrinsics, so
42 * the pixman-x64-mmx-emulation.h file contains
43 * implementations of those MMX intrinsics that
44 * are used in the SSE2 implementation.
46 # include "pixman-x64-mmx-emulation.h"
49 static __m128i mask_0080;
50 static __m128i mask_00ff;
51 static __m128i mask_0101;
52 static __m128i mask_ffff;
53 static __m128i mask_ff000000;
54 static __m128i mask_alpha;
56 static __m128i mask_565_r;
57 static __m128i mask_565_g1, mask_565_g2;
58 static __m128i mask_565_b;
59 static __m128i mask_red;
60 static __m128i mask_green;
61 static __m128i mask_blue;
63 static __m128i mask_565_fix_rb;
64 static __m128i mask_565_fix_g;
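/* 128-bit constants used throughout the fast paths below; they are
 * initialized once, when the SSE2 implementation is created. */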
66 static force_inline __m128i
67 unpack_32_1x128 (uint32_t data)
69 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
72 static force_inline void
73 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
75 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
76 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
79 static force_inline __m128i
80 unpack_565_to_8888 (__m128i lo)
82 __m128i r, g, b, rb, t;
84 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
85 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
86 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
88 rb = _mm_or_si128 (r, b);
89 t = _mm_and_si128 (rb, mask_565_fix_rb);
90 t = _mm_srli_epi32 (t, 5);
91 rb = _mm_or_si128 (rb, t);
93 t = _mm_and_si128 (g, mask_565_fix_g);
94 t = _mm_srli_epi32 (t, 6);
95 g = _mm_or_si128 (g, t);
97 return _mm_or_si128 (rb, g);
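/* A scalar sketch (illustrative only, not part of the original file)
 * of the per-pixel math above: each 565 field is shifted into its
 * 8888 position and its top bits are replicated into the freed low
 * bits, so 0x1f expands to 0xff and 0x00 stays 0x00.
 */
static force_inline uint32_t
scalar_565_to_8888_sketch (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);	/* 5 -> 8 bits */
    g = (g << 2) | (g >> 4);	/* 6 -> 8 bits */
    b = (b << 3) | (b >> 2);	/* 5 -> 8 bits */

    return (r << 16) | (g << 8) | b;
}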
100 static force_inline void
101 unpack_565_128_4x128 (__m128i data,
109 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
110 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
112 lo = unpack_565_to_8888 (lo);
113 hi = unpack_565_to_8888 (hi);
115 unpack_128_2x128 (lo, data0, data1);
116 unpack_128_2x128 (hi, data2, data3);
119 static force_inline uint16_t
120 pack_565_32_16 (uint32_t pixel)
122 return (uint16_t) (((pixel >> 8) & 0xf800) |
123 ((pixel >> 5) & 0x07e0) |
124 ((pixel >> 3) & 0x001f));
127 static force_inline __m128i
128 pack_2x128_128 (__m128i lo, __m128i hi)
130 return _mm_packus_epi16 (lo, hi);
133 static force_inline __m128i
134 pack_565_2x128_128 (__m128i lo, __m128i hi)
137 __m128i r, g1, g2, b;
139 data = pack_2x128_128 (lo, hi);
141 r = _mm_and_si128 (data, mask_565_r);
142 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
143 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
144 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
146 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
149 static force_inline __m128i
150 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
152 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
153 pack_565_2x128_128 (*xmm2, *xmm3));
156 static force_inline int
157 is_opaque (__m128i x)
159 __m128i ffs = _mm_cmpeq_epi8 (x, x);
161 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
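/* _mm_movemask_epi8 returns one bit per byte of the register.  Bits
 * 3, 7, 11 and 15 correspond to the top (alpha) bytes of the four
 * pixels, so masking the result with 0x8888 tests only the alpha
 * channels; is_zero below tests all sixteen bytes instead. */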
static force_inline int
is_zero (__m128i x)
    return _mm_movemask_epi8 (
168 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
171 static force_inline int
172 is_transparent (__m128i x)
174 return (_mm_movemask_epi8 (
175 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
178 static force_inline __m128i
179 expand_pixel_32_1x128 (uint32_t data)
181 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
184 static force_inline __m128i
185 expand_alpha_1x128 (__m128i data)
187 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
188 _MM_SHUFFLE (3, 3, 3, 3)),
189 _MM_SHUFFLE (3, 3, 3, 3));
192 static force_inline void
193 expand_alpha_2x128 (__m128i data_lo,
200 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
201 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
203 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
204 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
207 static force_inline void
208 expand_alpha_rev_2x128 (__m128i data_lo,
215 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
216 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
217 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
218 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
221 static force_inline void
222 pix_multiply_2x128 (__m128i* data_lo,
231 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
232 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
233 lo = _mm_adds_epu16 (lo, mask_0080);
234 hi = _mm_adds_epu16 (hi, mask_0080);
235 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
236 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
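/* A scalar sketch of the multiply above (the helper name is
 * illustrative, not part of the original file): each 8-bit channel
 * becomes x * a / 255 with rounding.  The SSE2 code gets the same
 * result from _mm_mulhi_epu16 (t, 0x0101): t * 0x0101 equals
 * (t << 8) + t, so its high 16 bits are (t + (t >> 8)) >> 8.
 */
static force_inline uint8_t
scalar_mul_un8_sketch (uint8_t x, uint8_t a)
{
    uint16_t t = (uint16_t)x * a + 0x80;	/* x * a + 128      */
    return (uint8_t)((t + (t >> 8)) >> 8);	/* t / 255, rounded */
}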
239 static force_inline void
240 pix_add_multiply_2x128 (__m128i* src_lo,
242 __m128i* alpha_dst_lo,
243 __m128i* alpha_dst_hi,
246 __m128i* alpha_src_lo,
247 __m128i* alpha_src_hi,
251 __m128i t1_lo, t1_hi;
252 __m128i t2_lo, t2_hi;
254 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
255 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
257 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
258 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
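/* pix_add_multiply_2x128 is the shared kernel of the ATOP and XOR
 * combiners below:
 *
 *     ret = src * alpha_dst / 255 + dst * alpha_src / 255
 *
 * computed with a saturating add so the sum cannot wrap past 0xff. */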
261 static force_inline void
262 negate_2x128 (__m128i data_lo,
267 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
268 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
271 static force_inline void
272 invert_colors_2x128 (__m128i data_lo,
279 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
280 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
281 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
282 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
285 static force_inline void
286 over_2x128 (__m128i* src_lo,
295 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
297 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
299 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
300 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
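/* over_2x128 applies the premultiplied OVER operator,
 * dst = src + dst * (255 - src.alpha) / 255, to eight 16-bit
 * channels per register.  A per-channel scalar sketch (the helper
 * name is illustrative, not part of the original file):
 */
static force_inline uint8_t
scalar_over_sketch (uint8_t s, uint8_t sa, uint8_t d)
{
    uint16_t t = (uint16_t)d * (255 - sa) + 0x80;  /* d * (1 - sa) */
    return (uint8_t)(s + ((t + (t >> 8)) >> 8));   /* cannot overflow
                                                      for valid
                                                      premultiplied
                                                      pixels */
}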
303 static force_inline void
304 over_rev_non_pre_2x128 (__m128i src_lo,
310 __m128i alpha_lo, alpha_hi;
312 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
314 lo = _mm_or_si128 (alpha_lo, mask_alpha);
315 hi = _mm_or_si128 (alpha_hi, mask_alpha);
317 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
319 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
321 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
324 static force_inline void
325 in_over_2x128 (__m128i* src_lo,
337 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
338 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
340 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
/* load 4 pixels from a 16-byte aligned address */
344 static force_inline __m128i
345 load_128_aligned (__m128i* src)
347 return _mm_load_si128 (src);
/* load 4 pixels from an unaligned address */
351 static force_inline __m128i
352 load_128_unaligned (const __m128i* src)
354 return _mm_loadu_si128 (src);
/* save 4 pixels using a non-temporal (write-combining) store
 * to a 16-byte aligned address
360 static force_inline void
361 save_128_write_combining (__m128i* dst,
364 _mm_stream_si128 (dst, data);
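/* _mm_stream_si128 is a non-temporal store: it bypasses the caches,
 * which helps large fills and blits whose destination will not be
 * read back soon. */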
/* save 4 pixels to a 16-byte aligned address */
368 static force_inline void
369 save_128_aligned (__m128i* dst,
372 _mm_store_si128 (dst, data);
/* save 4 pixels to an unaligned address */
376 static force_inline void
377 save_128_unaligned (__m128i* dst,
380 _mm_storeu_si128 (dst, data);
383 static force_inline __m128i
384 load_32_1x128 (uint32_t data)
386 return _mm_cvtsi32_si128 (data);
389 static force_inline __m128i
390 expand_alpha_rev_1x128 (__m128i data)
392 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
395 static force_inline __m128i
396 expand_pixel_8_1x128 (uint8_t data)
398 return _mm_shufflelo_epi16 (
399 unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
402 static force_inline __m128i
403 pix_multiply_1x128 (__m128i data,
406 return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
411 static force_inline __m128i
412 pix_add_multiply_1x128 (__m128i* src,
417 __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
418 __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
420 return _mm_adds_epu8 (t1, t2);
423 static force_inline __m128i
424 negate_1x128 (__m128i data)
426 return _mm_xor_si128 (data, mask_00ff);
429 static force_inline __m128i
430 invert_colors_1x128 (__m128i data)
432 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
435 static force_inline __m128i
436 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
438 return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
441 static force_inline __m128i
442 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
444 return over_1x128 (pix_multiply_1x128 (*src, *mask),
445 pix_multiply_1x128 (*alpha, *mask),
449 static force_inline __m128i
450 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
452 __m128i alpha = expand_alpha_1x128 (src);
454 return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
455 _mm_or_si128 (alpha, mask_alpha)),
460 static force_inline uint32_t
461 pack_1x128_32 (__m128i data)
463 return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
466 static force_inline __m128i
467 expand565_16_1x128 (uint16_t pixel)
469 __m128i m = _mm_cvtsi32_si128 (pixel);
471 m = unpack_565_to_8888 (m);
473 return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
476 static force_inline uint32_t
477 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
490 xmms = unpack_32_1x128 (src);
491 return pack_1x128_32 (
492 over_1x128 (xmms, expand_alpha_1x128 (xmms),
493 unpack_32_1x128 (dst)));
499 static force_inline uint32_t
500 combine1 (const uint32_t *ps, const uint32_t *pm)
508 mm = unpack_32_1x128 (*pm);
509 mm = expand_alpha_1x128 (mm);
511 ms = unpack_32_1x128 (s);
512 ms = pix_multiply_1x128 (ms, mm);
514 s = pack_1x128_32 (ms);
520 static force_inline __m128i
521 combine4 (const __m128i *ps, const __m128i *pm)
523 __m128i xmm_src_lo, xmm_src_hi;
524 __m128i xmm_msk_lo, xmm_msk_hi;
529 xmm_msk_lo = load_128_unaligned (pm);
531 if (is_transparent (xmm_msk_lo))
532 return _mm_setzero_si128 ();
535 s = load_128_unaligned (ps);
539 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
540 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
542 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
544 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
545 &xmm_msk_lo, &xmm_msk_hi,
546 &xmm_src_lo, &xmm_src_hi);
548 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
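/* combine1 and combine4 fetch source pixels, applying the mask's
 * alpha when a mask pointer is supplied.  Every combiner below
 * follows the same three-phase shape: process single pixels until
 * dst reaches a 16-byte boundary, run the 4-pixels-at-a-time SSE2
 * loop with aligned stores, then finish the trailing 0-3 pixels one
 * at a time. */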
554 static force_inline void
555 core_combine_over_u_sse2_mask (uint32_t * pd,
562 /* Align dst on a 16-byte boundary */
563 while (w && ((unsigned long)pd & 15))
566 s = combine1 (ps, pm);
569 *pd = core_combine_over_u_pixel_sse2 (s, d);
578 __m128i mask = load_128_unaligned ((__m128i *)pm);
583 __m128i src_hi, src_lo;
584 __m128i mask_hi, mask_lo;
585 __m128i alpha_hi, alpha_lo;
587 src = load_128_unaligned ((__m128i *)ps);
589 if (is_opaque (_mm_and_si128 (src, mask)))
591 save_128_aligned ((__m128i *)pd, src);
595 __m128i dst = load_128_aligned ((__m128i *)pd);
596 __m128i dst_hi, dst_lo;
598 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
599 unpack_128_2x128 (src, &src_lo, &src_hi);
601 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
602 pix_multiply_2x128 (&src_lo, &src_hi,
606 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
608 expand_alpha_2x128 (src_lo, src_hi,
609 &alpha_lo, &alpha_hi);
611 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
616 pack_2x128_128 (dst_lo, dst_hi));
628 s = combine1 (ps, pm);
631 *pd = core_combine_over_u_pixel_sse2 (s, d);
640 static force_inline void
641 core_combine_over_u_sse2_no_mask (uint32_t * pd,
647 /* Align dst on a 16-byte boundary */
648 while (w && ((unsigned long)pd & 15))
654 *pd = core_combine_over_u_pixel_sse2 (s, d);
663 __m128i src_hi, src_lo, dst_hi, dst_lo;
664 __m128i alpha_hi, alpha_lo;
666 src = load_128_unaligned ((__m128i *)ps);
672 save_128_aligned ((__m128i *)pd, src);
676 __m128i dst = load_128_aligned ((__m128i *)pd);
678 unpack_128_2x128 (src, &src_lo, &src_hi);
679 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
681 expand_alpha_2x128 (src_lo, src_hi,
682 &alpha_lo, &alpha_hi);
683 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
688 pack_2x128_128 (dst_lo, dst_hi));
702 *pd = core_combine_over_u_pixel_sse2 (s, d);
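/* Dispatch to the masked or unmasked OVER inner loop depending on
 * whether a mask pointer was supplied. */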
710 static force_inline void
711 sse2_combine_over_u (pixman_implementation_t *imp,
719 core_combine_over_u_sse2_mask (pd, ps, pm, w);
721 core_combine_over_u_sse2_no_mask (pd, ps, w);
725 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
734 __m128i xmm_dst_lo, xmm_dst_hi;
735 __m128i xmm_src_lo, xmm_src_hi;
736 __m128i xmm_alpha_lo, xmm_alpha_hi;
738 /* Align dst on a 16-byte boundary */
740 ((unsigned long)pd & 15))
743 s = combine1 (ps, pm);
745 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
/* Load unaligned here because the combined source/mask
 * address is not guaranteed to be 16-byte aligned.
757 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
758 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
760 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
761 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
763 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
764 &xmm_alpha_lo, &xmm_alpha_hi);
766 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
767 &xmm_alpha_lo, &xmm_alpha_hi,
768 &xmm_src_lo, &xmm_src_hi);
/* rebuild the 4 pixels and save */
771 save_128_aligned ((__m128i*)pd,
772 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
785 s = combine1 (ps, pm);
787 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
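/* Helper for the IN operators: scales dst by src's alpha,
 * result = dst * src.alpha / 255, with fast paths for src.alpha == 0
 * (the result is 0) and src.alpha == 0xff (dst is unchanged). */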
795 static force_inline uint32_t
796 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
798 uint32_t maska = src >> 24;
804 else if (maska != 0xff)
806 return pack_1x128_32 (
807 pix_multiply_1x128 (unpack_32_1x128 (dst),
808 expand_alpha_1x128 (unpack_32_1x128 (src))));
815 sse2_combine_in_u (pixman_implementation_t *imp,
824 __m128i xmm_src_lo, xmm_src_hi;
825 __m128i xmm_dst_lo, xmm_dst_hi;
827 while (w && ((unsigned long) pd & 15))
829 s = combine1 (ps, pm);
832 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
841 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
842 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
844 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
845 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
847 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
848 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
849 &xmm_dst_lo, &xmm_dst_hi,
850 &xmm_dst_lo, &xmm_dst_hi);
852 save_128_aligned ((__m128i*)pd,
853 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
864 s = combine1 (ps, pm);
867 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
876 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
885 __m128i xmm_src_lo, xmm_src_hi;
886 __m128i xmm_dst_lo, xmm_dst_hi;
888 while (w && ((unsigned long) pd & 15))
890 s = combine1 (ps, pm);
893 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
902 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
903 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
905 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
906 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
908 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
909 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
910 &xmm_src_lo, &xmm_src_hi,
911 &xmm_dst_lo, &xmm_dst_hi);
914 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
925 s = combine1 (ps, pm);
928 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
937 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
944 while (w && ((unsigned long) pd & 15))
946 uint32_t s = combine1 (ps, pm);
949 *pd++ = pack_1x128_32 (
951 unpack_32_1x128 (d), negate_1x128 (
952 expand_alpha_1x128 (unpack_32_1x128 (s)))));
962 __m128i xmm_src_lo, xmm_src_hi;
963 __m128i xmm_dst_lo, xmm_dst_hi;
965 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
966 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
968 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
969 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
971 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
972 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
974 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
975 &xmm_src_lo, &xmm_src_hi,
976 &xmm_dst_lo, &xmm_dst_hi);
979 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
991 uint32_t s = combine1 (ps, pm);
994 *pd++ = pack_1x128_32 (
996 unpack_32_1x128 (d), negate_1x128 (
997 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1006 sse2_combine_out_u (pixman_implementation_t *imp,
1009 const uint32_t * ps,
1010 const uint32_t * pm,
1013 while (w && ((unsigned long) pd & 15))
1015 uint32_t s = combine1 (ps, pm);
1018 *pd++ = pack_1x128_32 (
1019 pix_multiply_1x128 (
1020 unpack_32_1x128 (s), negate_1x128 (
1021 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1030 __m128i xmm_src_lo, xmm_src_hi;
1031 __m128i xmm_dst_lo, xmm_dst_hi;
1033 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1034 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1036 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1037 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1039 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1040 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1042 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1043 &xmm_dst_lo, &xmm_dst_hi,
1044 &xmm_dst_lo, &xmm_dst_hi);
1047 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1058 uint32_t s = combine1 (ps, pm);
1061 *pd++ = pack_1x128_32 (
1062 pix_multiply_1x128 (
1063 unpack_32_1x128 (s), negate_1x128 (
1064 expand_alpha_1x128 (unpack_32_1x128 (d)))));
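/* ATOP: result = src * dst.alpha / 255 + dst * (255 - src.alpha) / 255,
 * i.e. the source is drawn only where the destination already has
 * coverage. */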
1072 static force_inline uint32_t
1073 core_combine_atop_u_pixel_sse2 (uint32_t src,
1076 __m128i s = unpack_32_1x128 (src);
1077 __m128i d = unpack_32_1x128 (dst);
1079 __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1080 __m128i da = expand_alpha_1x128 (d);
1082 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1086 sse2_combine_atop_u (pixman_implementation_t *imp,
1089 const uint32_t * ps,
1090 const uint32_t * pm,
1095 __m128i xmm_src_lo, xmm_src_hi;
1096 __m128i xmm_dst_lo, xmm_dst_hi;
1097 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1098 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1100 while (w && ((unsigned long) pd & 15))
1102 s = combine1 (ps, pm);
1105 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1114 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1115 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1117 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1118 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1120 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1121 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1122 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1123 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1125 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1126 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1128 pix_add_multiply_2x128 (
1129 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1130 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1131 &xmm_dst_lo, &xmm_dst_hi);
1134 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1145 s = combine1 (ps, pm);
1148 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1156 static force_inline uint32_t
1157 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1160 __m128i s = unpack_32_1x128 (src);
1161 __m128i d = unpack_32_1x128 (dst);
1163 __m128i sa = expand_alpha_1x128 (s);
1164 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1166 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1170 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1173 const uint32_t * ps,
1174 const uint32_t * pm,
1179 __m128i xmm_src_lo, xmm_src_hi;
1180 __m128i xmm_dst_lo, xmm_dst_hi;
1181 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1182 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1184 while (w && ((unsigned long) pd & 15))
1186 s = combine1 (ps, pm);
1189 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1198 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1199 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1201 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1202 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1204 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1205 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1206 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1207 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1209 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1210 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1212 pix_add_multiply_2x128 (
1213 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1214 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1215 &xmm_dst_lo, &xmm_dst_hi);
1218 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1229 s = combine1 (ps, pm);
1232 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
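/* XOR: result = src * (255 - dst.alpha) / 255
 *             + dst * (255 - src.alpha) / 255 */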
1240 static force_inline uint32_t
1241 core_combine_xor_u_pixel_sse2 (uint32_t src,
1244 __m128i s = unpack_32_1x128 (src);
1245 __m128i d = unpack_32_1x128 (dst);
1247 __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1248 __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1250 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1254 sse2_combine_xor_u (pixman_implementation_t *imp,
1257 const uint32_t * src,
1258 const uint32_t * mask,
1264 const uint32_t* ps = src;
1265 const uint32_t* pm = mask;
1267 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1268 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1269 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1270 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1272 while (w && ((unsigned long) pd & 15))
1274 s = combine1 (ps, pm);
1277 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1286 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1287 xmm_dst = load_128_aligned ((__m128i*) pd);
1289 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1290 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1292 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1293 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1294 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1295 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1297 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1298 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1299 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1300 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1302 pix_add_multiply_2x128 (
1303 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1304 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1305 &xmm_dst_lo, &xmm_dst_hi);
1308 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1319 s = combine1 (ps, pm);
1322 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1330 static force_inline void
1331 sse2_combine_add_u (pixman_implementation_t *imp,
1334 const uint32_t * src,
1335 const uint32_t * mask,
1341 const uint32_t* ps = src;
1342 const uint32_t* pm = mask;
1344 while (w && (unsigned long)pd & 15)
1346 s = combine1 (ps, pm);
1352 *pd++ = _mm_cvtsi128_si32 (
1353 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1361 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1364 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1375 s = combine1 (ps, pm);
1379 *pd++ = _mm_cvtsi128_si32 (
1380 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
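/* SATURATE: when the source alpha exceeds the room left in the
 * destination (sa > ~da), the source is pre-scaled by da / sa so the
 * saturating add below cannot clip. */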
1386 static force_inline uint32_t
1387 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1390 __m128i ms = unpack_32_1x128 (src);
1391 __m128i md = unpack_32_1x128 (dst);
1392 uint32_t sa = src >> 24;
1393 uint32_t da = ~dst >> 24;
1397 ms = pix_multiply_1x128 (
1398 ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1401 return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1405 sse2_combine_saturate_u (pixman_implementation_t *imp,
1408 const uint32_t * ps,
1409 const uint32_t * pm,
1415 __m128i xmm_src, xmm_dst;
1417 while (w && (unsigned long)pd & 15)
1419 s = combine1 (ps, pm);
1422 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1431 xmm_dst = load_128_aligned ((__m128i*)pd);
1432 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1434 pack_cmp = _mm_movemask_epi8 (
1436 _mm_srli_epi32 (xmm_src, 24),
1437 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
/* if any source alpha is greater than the corresponding ~dst alpha */
1442 s = combine1 (ps++, pm);
1444 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1448 s = combine1 (ps++, pm);
1450 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1454 s = combine1 (ps++, pm);
1456 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1460 s = combine1 (ps++, pm);
1462 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1468 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1481 s = combine1 (ps, pm);
1484 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1492 sse2_combine_src_ca (pixman_implementation_t *imp,
1495 const uint32_t * ps,
1496 const uint32_t * pm,
1501 __m128i xmm_src_lo, xmm_src_hi;
1502 __m128i xmm_mask_lo, xmm_mask_hi;
1503 __m128i xmm_dst_lo, xmm_dst_hi;
1505 while (w && (unsigned long)pd & 15)
1509 *pd++ = pack_1x128_32 (
1510 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1516 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1517 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1519 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1520 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1522 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1523 &xmm_mask_lo, &xmm_mask_hi,
1524 &xmm_dst_lo, &xmm_dst_hi);
1527 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1539 *pd++ = pack_1x128_32 (
1540 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
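/* The *_ca ("component alpha") combiners treat the mask as a full
 * ARGB value: each source channel is scaled by the matching mask
 * channel, and every "source alpha" term becomes the per-channel
 * product mask * src.alpha. */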
1545 static force_inline uint32_t
1546 core_combine_over_ca_pixel_sse2 (uint32_t src,
1550 __m128i s = unpack_32_1x128 (src);
1551 __m128i expAlpha = expand_alpha_1x128 (s);
1552 __m128i unpk_mask = unpack_32_1x128 (mask);
1553 __m128i unpk_dst = unpack_32_1x128 (dst);
1555 return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1559 sse2_combine_over_ca (pixman_implementation_t *imp,
1562 const uint32_t * ps,
1563 const uint32_t * pm,
1568 __m128i xmm_alpha_lo, xmm_alpha_hi;
1569 __m128i xmm_src_lo, xmm_src_hi;
1570 __m128i xmm_dst_lo, xmm_dst_hi;
1571 __m128i xmm_mask_lo, xmm_mask_hi;
1573 while (w && (unsigned long)pd & 15)
1579 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1585 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1586 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1587 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1589 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1590 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1591 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1593 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1594 &xmm_alpha_lo, &xmm_alpha_hi);
1596 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1597 &xmm_alpha_lo, &xmm_alpha_hi,
1598 &xmm_mask_lo, &xmm_mask_hi,
1599 &xmm_dst_lo, &xmm_dst_hi);
1602 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1616 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1621 static force_inline uint32_t
1622 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1626 __m128i d = unpack_32_1x128 (dst);
1628 return pack_1x128_32 (
1629 over_1x128 (d, expand_alpha_1x128 (d),
1630 pix_multiply_1x128 (unpack_32_1x128 (src),
1631 unpack_32_1x128 (mask))));
1635 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1638 const uint32_t * ps,
1639 const uint32_t * pm,
1644 __m128i xmm_alpha_lo, xmm_alpha_hi;
1645 __m128i xmm_src_lo, xmm_src_hi;
1646 __m128i xmm_dst_lo, xmm_dst_hi;
1647 __m128i xmm_mask_lo, xmm_mask_hi;
1649 while (w && (unsigned long)pd & 15)
1655 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1661 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1662 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1663 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1665 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1666 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1667 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1669 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1670 &xmm_alpha_lo, &xmm_alpha_hi);
1671 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1672 &xmm_mask_lo, &xmm_mask_hi,
1673 &xmm_mask_lo, &xmm_mask_hi);
1675 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1676 &xmm_alpha_lo, &xmm_alpha_hi,
1677 &xmm_mask_lo, &xmm_mask_hi);
1680 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1694 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1700 sse2_combine_in_ca (pixman_implementation_t *imp,
1703 const uint32_t * ps,
1704 const uint32_t * pm,
1709 __m128i xmm_alpha_lo, xmm_alpha_hi;
1710 __m128i xmm_src_lo, xmm_src_hi;
1711 __m128i xmm_dst_lo, xmm_dst_hi;
1712 __m128i xmm_mask_lo, xmm_mask_hi;
1714 while (w && (unsigned long)pd & 15)
1720 *pd++ = pack_1x128_32 (
1721 pix_multiply_1x128 (
1722 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1723 expand_alpha_1x128 (unpack_32_1x128 (d))));
1730 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1731 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1732 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1734 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1735 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1736 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1738 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1739 &xmm_alpha_lo, &xmm_alpha_hi);
1741 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1742 &xmm_mask_lo, &xmm_mask_hi,
1743 &xmm_dst_lo, &xmm_dst_hi);
1745 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1746 &xmm_alpha_lo, &xmm_alpha_hi,
1747 &xmm_dst_lo, &xmm_dst_hi);
1750 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1764 *pd++ = pack_1x128_32 (
1765 pix_multiply_1x128 (
1766 pix_multiply_1x128 (
1767 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1768 expand_alpha_1x128 (unpack_32_1x128 (d))));
1775 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1778 const uint32_t * ps,
1779 const uint32_t * pm,
1784 __m128i xmm_alpha_lo, xmm_alpha_hi;
1785 __m128i xmm_src_lo, xmm_src_hi;
1786 __m128i xmm_dst_lo, xmm_dst_hi;
1787 __m128i xmm_mask_lo, xmm_mask_hi;
1789 while (w && (unsigned long)pd & 15)
1795 *pd++ = pack_1x128_32 (
1796 pix_multiply_1x128 (
1797 unpack_32_1x128 (d),
1798 pix_multiply_1x128 (unpack_32_1x128 (m),
1799 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1805 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1806 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1807 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1809 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1810 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1811 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1813 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1814 &xmm_alpha_lo, &xmm_alpha_hi);
1815 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1816 &xmm_alpha_lo, &xmm_alpha_hi,
1817 &xmm_alpha_lo, &xmm_alpha_hi);
1819 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1820 &xmm_alpha_lo, &xmm_alpha_hi,
1821 &xmm_dst_lo, &xmm_dst_hi);
1824 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1838 *pd++ = pack_1x128_32 (
1839 pix_multiply_1x128 (
1840 unpack_32_1x128 (d),
1841 pix_multiply_1x128 (unpack_32_1x128 (m),
1842 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1848 sse2_combine_out_ca (pixman_implementation_t *imp,
1851 const uint32_t * ps,
1852 const uint32_t * pm,
1857 __m128i xmm_alpha_lo, xmm_alpha_hi;
1858 __m128i xmm_src_lo, xmm_src_hi;
1859 __m128i xmm_dst_lo, xmm_dst_hi;
1860 __m128i xmm_mask_lo, xmm_mask_hi;
1862 while (w && (unsigned long)pd & 15)
1868 *pd++ = pack_1x128_32 (
1869 pix_multiply_1x128 (
1870 pix_multiply_1x128 (
1871 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1872 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1878 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1879 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1880 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1882 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1883 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1884 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1886 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1887 &xmm_alpha_lo, &xmm_alpha_hi);
1888 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1889 &xmm_alpha_lo, &xmm_alpha_hi);
1891 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1892 &xmm_mask_lo, &xmm_mask_hi,
1893 &xmm_dst_lo, &xmm_dst_hi);
1894 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1895 &xmm_alpha_lo, &xmm_alpha_hi,
1896 &xmm_dst_lo, &xmm_dst_hi);
1899 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1913 *pd++ = pack_1x128_32 (
1914 pix_multiply_1x128 (
1915 pix_multiply_1x128 (
1916 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1917 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1924 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1927 const uint32_t * ps,
1928 const uint32_t * pm,
1933 __m128i xmm_alpha_lo, xmm_alpha_hi;
1934 __m128i xmm_src_lo, xmm_src_hi;
1935 __m128i xmm_dst_lo, xmm_dst_hi;
1936 __m128i xmm_mask_lo, xmm_mask_hi;
1938 while (w && (unsigned long)pd & 15)
1944 *pd++ = pack_1x128_32 (
1945 pix_multiply_1x128 (
1946 unpack_32_1x128 (d),
1947 negate_1x128 (pix_multiply_1x128 (
1948 unpack_32_1x128 (m),
1949 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1955 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1956 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1957 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1959 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1960 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1961 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1963 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1964 &xmm_alpha_lo, &xmm_alpha_hi);
1966 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1967 &xmm_alpha_lo, &xmm_alpha_hi,
1968 &xmm_mask_lo, &xmm_mask_hi);
1970 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1971 &xmm_mask_lo, &xmm_mask_hi);
1973 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1974 &xmm_mask_lo, &xmm_mask_hi,
1975 &xmm_dst_lo, &xmm_dst_hi);
1978 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1992 *pd++ = pack_1x128_32 (
1993 pix_multiply_1x128 (
1994 unpack_32_1x128 (d),
1995 negate_1x128 (pix_multiply_1x128 (
1996 unpack_32_1x128 (m),
1997 expand_alpha_1x128 (unpack_32_1x128 (s))))));
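/* Component-alpha ATOP: with s' = s * m and per-channel source
 * alpha m * s.alpha, the result is
 * d * ~(m * s.alpha) + s' * d.alpha, each product divided by 255. */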
2002 static force_inline uint32_t
2003 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2007 __m128i m = unpack_32_1x128 (mask);
2008 __m128i s = unpack_32_1x128 (src);
2009 __m128i d = unpack_32_1x128 (dst);
2010 __m128i sa = expand_alpha_1x128 (s);
2011 __m128i da = expand_alpha_1x128 (d);
2013 s = pix_multiply_1x128 (s, m);
2014 m = negate_1x128 (pix_multiply_1x128 (m, sa));
2016 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2020 sse2_combine_atop_ca (pixman_implementation_t *imp,
2023 const uint32_t * ps,
2024 const uint32_t * pm,
2029 __m128i xmm_src_lo, xmm_src_hi;
2030 __m128i xmm_dst_lo, xmm_dst_hi;
2031 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2032 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2033 __m128i xmm_mask_lo, xmm_mask_hi;
2035 while (w && (unsigned long)pd & 15)
2041 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2047 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2048 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2049 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2051 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2052 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2053 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2055 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2056 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2057 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2058 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2060 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2061 &xmm_mask_lo, &xmm_mask_hi,
2062 &xmm_src_lo, &xmm_src_hi);
2063 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2064 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2065 &xmm_mask_lo, &xmm_mask_hi);
2067 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2069 pix_add_multiply_2x128 (
2070 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2071 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2072 &xmm_dst_lo, &xmm_dst_hi);
2075 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2089 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2094 static force_inline uint32_t
2095 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2099 __m128i m = unpack_32_1x128 (mask);
2100 __m128i s = unpack_32_1x128 (src);
2101 __m128i d = unpack_32_1x128 (dst);
2103 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2104 __m128i sa = expand_alpha_1x128 (s);
2106 s = pix_multiply_1x128 (s, m);
2107 m = pix_multiply_1x128 (m, sa);
2109 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2113 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2116 const uint32_t * ps,
2117 const uint32_t * pm,
2122 __m128i xmm_src_lo, xmm_src_hi;
2123 __m128i xmm_dst_lo, xmm_dst_hi;
2124 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2125 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2126 __m128i xmm_mask_lo, xmm_mask_hi;
2128 while (w && (unsigned long)pd & 15)
2134 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2140 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2141 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2142 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2144 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2145 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2146 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2148 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2149 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2150 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2151 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2153 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2154 &xmm_mask_lo, &xmm_mask_hi,
2155 &xmm_src_lo, &xmm_src_hi);
2156 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2157 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2158 &xmm_mask_lo, &xmm_mask_hi);
2160 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2161 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2163 pix_add_multiply_2x128 (
2164 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2165 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2166 &xmm_dst_lo, &xmm_dst_hi);
2169 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2183 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2188 static force_inline uint32_t
2189 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2193 __m128i a = unpack_32_1x128 (mask);
2194 __m128i s = unpack_32_1x128 (src);
2195 __m128i d = unpack_32_1x128 (dst);
2197 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2198 a, expand_alpha_1x128 (s)));
2199 __m128i dest = pix_multiply_1x128 (s, a);
2200 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2202 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2209 sse2_combine_xor_ca (pixman_implementation_t *imp,
2212 const uint32_t * ps,
2213 const uint32_t * pm,
2218 __m128i xmm_src_lo, xmm_src_hi;
2219 __m128i xmm_dst_lo, xmm_dst_hi;
2220 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2221 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2222 __m128i xmm_mask_lo, xmm_mask_hi;
2224 while (w && (unsigned long)pd & 15)
2230 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2236 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2237 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2238 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2240 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2241 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2242 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2244 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2245 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2246 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2247 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2249 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2250 &xmm_mask_lo, &xmm_mask_hi,
2251 &xmm_src_lo, &xmm_src_hi);
2252 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2253 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2254 &xmm_mask_lo, &xmm_mask_hi);
2256 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2257 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2258 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2259 &xmm_mask_lo, &xmm_mask_hi);
2261 pix_add_multiply_2x128 (
2262 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2263 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2264 &xmm_dst_lo, &xmm_dst_hi);
2267 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2281 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2287 sse2_combine_add_ca (pixman_implementation_t *imp,
2290 const uint32_t * ps,
2291 const uint32_t * pm,
2296 __m128i xmm_src_lo, xmm_src_hi;
2297 __m128i xmm_dst_lo, xmm_dst_hi;
2298 __m128i xmm_mask_lo, xmm_mask_hi;
2300 while (w && (unsigned long)pd & 15)
2306 *pd++ = pack_1x128_32 (
2307 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2308 unpack_32_1x128 (m)),
2309 unpack_32_1x128 (d)));
2315 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2316 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2317 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2319 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2320 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2321 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2323 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2324 &xmm_mask_lo, &xmm_mask_hi,
2325 &xmm_src_lo, &xmm_src_hi);
2328 (__m128i*)pd, pack_2x128_128 (
2329 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2330 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2344 *pd++ = pack_1x128_32 (
2345 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2346 unpack_32_1x128 (m)),
2347 unpack_32_1x128 (d)));
2352 static force_inline __m128i
2353 create_mask_16_128 (uint16_t mask)
2355 return _mm_set1_epi16 (mask);
2358 /* Work around a code generation bug in Sun Studio 12. */
2359 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2360 # define create_mask_2x32_128(mask0, mask1) \
2361 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2363 static force_inline __m128i
2364 create_mask_2x32_128 (uint32_t mask0,
2367 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2372 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2374 pixman_image_t * src_image,
2375 pixman_image_t * mask_image,
2376 pixman_image_t * dst_image,
2387 uint32_t *dst_line, *dst, d;
2390 __m128i xmm_src, xmm_alpha;
2391 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2393 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2398 PIXMAN_IMAGE_GET_LINE (
2399 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2401 xmm_src = expand_pixel_32_1x128 (src);
2402 xmm_alpha = expand_alpha_1x128 (xmm_src);
2408 dst_line += dst_stride;
2411 while (w && (unsigned long)dst & 15)
2414 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2416 unpack_32_1x128 (d)));
2422 xmm_dst = load_128_aligned ((__m128i*)dst);
2424 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2426 over_2x128 (&xmm_src, &xmm_src,
2427 &xmm_alpha, &xmm_alpha,
2428 &xmm_dst_lo, &xmm_dst_hi);
/* rebuild the 4 pixels and save */
2432 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2441 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2443 unpack_32_1x128 (d)));
2451 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2453 pixman_image_t * src_image,
2454 pixman_image_t * mask_image,
2455 pixman_image_t * dst_image,
2466 uint16_t *dst_line, *dst, d;
2469 __m128i xmm_src, xmm_alpha;
2470 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2472 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2477 PIXMAN_IMAGE_GET_LINE (
2478 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2480 xmm_src = expand_pixel_32_1x128 (src);
2481 xmm_alpha = expand_alpha_1x128 (xmm_src);
2487 dst_line += dst_stride;
2490 while (w && (unsigned long)dst & 15)
2494 *dst++ = pack_565_32_16 (
2495 pack_1x128_32 (over_1x128 (xmm_src,
2497 expand565_16_1x128 (d))));
2503 xmm_dst = load_128_aligned ((__m128i*)dst);
2505 unpack_565_128_4x128 (xmm_dst,
2506 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2508 over_2x128 (&xmm_src, &xmm_src,
2509 &xmm_alpha, &xmm_alpha,
2510 &xmm_dst0, &xmm_dst1);
2511 over_2x128 (&xmm_src, &xmm_src,
2512 &xmm_alpha, &xmm_alpha,
2513 &xmm_dst2, &xmm_dst3);
2515 xmm_dst = pack_565_4x128_128 (
2516 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2518 save_128_aligned ((__m128i*)dst, xmm_dst);
2527 *dst++ = pack_565_32_16 (
2528 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2529 expand565_16_1x128 (d))));
2536 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2538 pixman_image_t * src_image,
2539 pixman_image_t * mask_image,
2540 pixman_image_t * dst_image,
2551 uint32_t *dst_line, d;
2552 uint32_t *mask_line, m;
2554 int dst_stride, mask_stride;
2556 __m128i xmm_src, xmm_alpha;
2558 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2560 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2562 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2568 PIXMAN_IMAGE_GET_LINE (
2569 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2570 PIXMAN_IMAGE_GET_LINE (
2571 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2573 xmm_src = _mm_unpacklo_epi8 (
2574 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2575 xmm_alpha = expand_alpha_1x128 (xmm_src);
2577 mmx_alpha = xmm_alpha;
2582 const uint32_t *pm = (uint32_t *)mask_line;
2583 uint32_t *pd = (uint32_t *)dst_line;
2585 dst_line += dst_stride;
2586 mask_line += mask_stride;
2588 while (w && (unsigned long)pd & 15)
2596 mmx_mask = unpack_32_1x128 (m);
2597 mmx_dest = unpack_32_1x128 (d);
2599 *pd = pack_1x128_32 (
2600 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2609 xmm_mask = load_128_unaligned ((__m128i*)pm);
2613 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
/* if all bits in the mask are zero, pack_cmp equals 0xffff */
2616 if (pack_cmp != 0xffff)
2618 xmm_dst = load_128_aligned ((__m128i*)pd);
2620 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2622 pix_multiply_2x128 (&xmm_src, &xmm_src,
2623 &xmm_mask_lo, &xmm_mask_hi,
2624 &xmm_mask_lo, &xmm_mask_hi);
2625 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2628 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2644 mmx_mask = unpack_32_1x128 (m);
2645 mmx_dest = unpack_32_1x128 (d);
2647 *pd = pack_1x128_32 (
2648 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2659 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2661 pixman_image_t * src_image,
2662 pixman_image_t * mask_image,
2663 pixman_image_t * dst_image,
2674 uint32_t *dst_line, d;
2675 uint32_t *mask_line, m;
2677 int dst_stride, mask_stride;
2679 __m128i xmm_src, xmm_alpha;
2680 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2681 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2683 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2685 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2690 PIXMAN_IMAGE_GET_LINE (
2691 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2692 PIXMAN_IMAGE_GET_LINE (
2693 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2695 xmm_src = _mm_unpacklo_epi8 (
2696 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2697 xmm_alpha = expand_alpha_1x128 (xmm_src);
2699 mmx_alpha = xmm_alpha;
2704 const uint32_t *pm = (uint32_t *)mask_line;
2705 uint32_t *pd = (uint32_t *)dst_line;
2707 dst_line += dst_stride;
2708 mask_line += mask_stride;
2710 while (w && (unsigned long)pd & 15)
2717 mmx_mask = unpack_32_1x128 (m);
2718 mmx_dest = unpack_32_1x128 (d);
2720 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2732 xmm_mask = load_128_unaligned ((__m128i*)pm);
2736 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
/* if all bits in the mask are zero, pack_cmp equals 0xffff */
2739 if (pack_cmp != 0xffff)
2741 xmm_dst = load_128_aligned ((__m128i*)pd);
2743 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2744 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2746 in_over_2x128 (&xmm_src, &xmm_src,
2747 &xmm_alpha, &xmm_alpha,
2748 &xmm_mask_lo, &xmm_mask_hi,
2749 &xmm_dst_lo, &xmm_dst_hi);
2752 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2767 mmx_mask = unpack_32_1x128 (m);
2768 mmx_dest = unpack_32_1x128 (d);
2770 *pd = pack_1x128_32 (
2771 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2782 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2784 pixman_image_t * src_image,
2785 pixman_image_t * mask_image,
2786 pixman_image_t * dst_image,
2796 uint32_t *dst_line, *dst;
2797 uint32_t *src_line, *src;
2800 int dst_stride, src_stride;
2803 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2804 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2805 __m128i xmm_alpha_lo, xmm_alpha_hi;
2807 PIXMAN_IMAGE_GET_LINE (
2808 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2809 PIXMAN_IMAGE_GET_LINE (
2810 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2812 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2814 xmm_mask = create_mask_16_128 (mask >> 24);
2819 dst_line += dst_stride;
2821 src_line += src_stride;
2824 while (w && (unsigned long)dst & 15)
2826 uint32_t s = *src++;
2832 __m128i ms = unpack_32_1x128 (s);
2833 __m128i alpha = expand_alpha_1x128 (ms);
2834 __m128i dest = xmm_mask;
2835 __m128i alpha_dst = unpack_32_1x128 (d);
2837 *dst = pack_1x128_32 (
2838 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2846 xmm_src = load_128_unaligned ((__m128i*)src);
2848 if (!is_zero (xmm_src))
2850 xmm_dst = load_128_aligned ((__m128i*)dst);
2852 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2853 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2854 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2855 &xmm_alpha_lo, &xmm_alpha_hi);
2857 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2858 &xmm_alpha_lo, &xmm_alpha_hi,
2859 &xmm_mask, &xmm_mask,
2860 &xmm_dst_lo, &xmm_dst_hi);
2863 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2873 uint32_t s = *src++;
2879 __m128i ms = unpack_32_1x128 (s);
2880 __m128i alpha = expand_alpha_1x128 (ms);
2881 __m128i mask = xmm_mask;
2882 __m128i dest = unpack_32_1x128 (d);
2884 *dst = pack_1x128_32 (
2885 in_over_1x128 (&ms, &alpha, &mask, &dest));
2896 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2898 pixman_image_t * src_image,
2899 pixman_image_t * mask_image,
2900 pixman_image_t * dst_image,
2910 uint32_t *dst_line, *dst;
2911 uint32_t *src_line, *src;
2913 int dst_stride, src_stride;
2916 PIXMAN_IMAGE_GET_LINE (
2917 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2918 PIXMAN_IMAGE_GET_LINE (
2919 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2924 dst_line += dst_stride;
2926 src_line += src_stride;
2929 while (w && (unsigned long)dst & 15)
2931 *dst++ = *src++ | 0xff000000;
2937 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2939 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2940 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2941 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2942 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2944 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2945 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2946 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2947 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2956 *dst++ = *src++ | 0xff000000;
2964 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2966 pixman_image_t * src_image,
2967 pixman_image_t * mask_image,
2968 pixman_image_t * dst_image,
2978 uint32_t *dst_line, *dst;
2979 uint32_t *src_line, *src;
2981 int dst_stride, src_stride;
2984 __m128i xmm_mask, xmm_alpha;
2985 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2986 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2988 PIXMAN_IMAGE_GET_LINE (
2989 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2990 PIXMAN_IMAGE_GET_LINE (
2991 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2993 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2995 xmm_mask = create_mask_16_128 (mask >> 24);
2996 xmm_alpha = mask_00ff;
3001 dst_line += dst_stride;
3003 src_line += src_stride;
3006 while (w && (unsigned long)dst & 15)
3008 uint32_t s = (*src++) | 0xff000000;
3011 __m128i src = unpack_32_1x128 (s);
3012 __m128i alpha = xmm_alpha;
3013 __m128i mask = xmm_mask;
3014 __m128i dest = unpack_32_1x128 (d);
3016 *dst++ = pack_1x128_32 (
3017 in_over_1x128 (&src, &alpha, &mask, &dest));
3024 xmm_src = _mm_or_si128 (
3025 load_128_unaligned ((__m128i*)src), mask_ff000000);
3026 xmm_dst = load_128_aligned ((__m128i*)dst);
3028 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3029 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3031 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3032 &xmm_alpha, &xmm_alpha,
3033 &xmm_mask, &xmm_mask,
3034 &xmm_dst_lo, &xmm_dst_hi);
3037 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3047 uint32_t s = (*src++) | 0xff000000;
3050 __m128i src = unpack_32_1x128 (s);
3051 __m128i alpha = xmm_alpha;
3052 __m128i mask = xmm_mask;
3053 __m128i dest = unpack_32_1x128 (d);
3055 *dst++ = pack_1x128_32 (
3056 in_over_1x128 (&src, &alpha, &mask, &dest));
3065 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3067 pixman_image_t * src_image,
3068 pixman_image_t * mask_image,
3069 pixman_image_t * dst_image,
3079 int dst_stride, src_stride;
3080 uint32_t *dst_line, *dst;
3081 uint32_t *src_line, *src;
3083 PIXMAN_IMAGE_GET_LINE (
3084 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3085 PIXMAN_IMAGE_GET_LINE (
3086 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3093 sse2_combine_over_u (imp, op, dst, src, NULL, width);
3100 static force_inline uint16_t
3101 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3105 ms = unpack_32_1x128 (src);
3106 return pack_565_32_16 (
3109 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3113 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3115 pixman_image_t * src_image,
3116 pixman_image_t * mask_image,
3117 pixman_image_t * dst_image,
3127 uint16_t *dst_line, *dst, d;
3128 uint32_t *src_line, *src, s;
3129 int dst_stride, src_stride;
3132 __m128i xmm_alpha_lo, xmm_alpha_hi;
3133 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3134 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3136 PIXMAN_IMAGE_GET_LINE (
3137 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3138 PIXMAN_IMAGE_GET_LINE (
3139 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3146 dst_line += dst_stride;
3147 src_line += src_stride;
3150 /* Align dst on a 16-byte boundary */
3152 ((unsigned long)dst & 15))
3157 *dst++ = composite_over_8888_0565pixel (s, d);
/* This loop handles 8 pixels per iteration */
/* Load unaligned because the source address may not be
 * 16-byte aligned.
3167 xmm_src = load_128_unaligned ((__m128i*) src);
3168 xmm_dst = load_128_aligned ((__m128i*) dst);
3171 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3172 unpack_565_128_4x128 (xmm_dst,
3173 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3174 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3175 &xmm_alpha_lo, &xmm_alpha_hi);
3177 /* I'm loading next 4 pixels from memory
3178 * before to optimze the memory read.
3180 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3182 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3183 &xmm_alpha_lo, &xmm_alpha_hi,
3184 &xmm_dst0, &xmm_dst1);
3187 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3188 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3189 &xmm_alpha_lo, &xmm_alpha_hi);
3191 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3192 &xmm_alpha_lo, &xmm_alpha_hi,
3193 &xmm_dst2, &xmm_dst3);
3196 (__m128i*)dst, pack_565_4x128_128 (
3197 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3209 *dst++ = composite_over_8888_0565pixel (s, d);
static void
sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              pixman_image_t *         src_image,
                              pixman_image_t *         mask_image,
                              pixman_image_t *         dst_image,

    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha, xmm_def;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    mmx_src = xmm_src;
    mmx_alpha = xmm_alpha;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 15)
        {
            uint8_t m = *mask++;

            if (m)
            {
                uint32_t d = *dst;
                mmx_mask = expand_pixel_8_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                     &mmx_alpha,
                                                     &mmx_mask,
                                                     &mmx_dest));
            }

            w--;
            dst++;
        }

        while (w >= 4)
        {
            m = *((uint32_t*)mask);

            if (srca == 0xff && m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_def);
            }
            else if (m)
            {
                xmm_dst = load_128_aligned ((__m128i*) dst);
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            w -= 4;
            dst += 4;
            mask += 4;
        }

        while (w)
        {
            uint8_t m = *mask++;

            if (m)
            {
                uint32_t d = *dst;
                mmx_mask = expand_pixel_8_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                     &mmx_alpha,
                                                     &mmx_mask,
                                                     &mmx_dest));
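
/* The 4-pixel loop above reads four a8 mask values at once as a single
 * uint32_t: when the source is fully opaque and all four mask bytes are
 * 0xff, the solid color is stored directly; otherwise each mask byte is
 * widened to a full 8888 alpha with expand_alpha_rev before the
 * in_over.
 */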
static pixman_bool_t
pixman_fill_sse2 (uint32_t *bits,
                  int       stride,
                  int       bpp,
                  int       x,
                  int       y,
                  int       width,
                  int       height,
                  uint32_t  data)
{
    uint32_t byte_width;
    uint8_t *byte_line;

    __m128i xmm_def;

    if (bpp == 8)
    {
        uint8_t b;
        uint16_t w;

        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
        byte_width = width;

        b = data & 0xff;
        w = (b << 8) | b;
        data = (w << 16) | w;
    }
    else if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;

        data = (data & 0xffff) * 0x00010001;
    }
    else if (bpp == 32)
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
    }
    else
    {
        return FALSE;
    }

    xmm_def = create_mask_2x32_128 (data, data);

    while (height--)
    {
        int w;
        uint8_t *d = byte_line;
        byte_line += stride;
        w = byte_width;

        while (w >= 1 && ((unsigned long)d & 1))
        {
            *(uint8_t *)d = data;
            w--;
            d++;
        }

        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = data;
            w -= 2;
            d += 2;
        }

        while (w >= 4 && ((unsigned long)d & 15))
        {
            *(uint32_t *)d = data;
            w -= 4;
            d += 4;
        }

        while (w >= 128)
        {
            save_128_aligned ((__m128i*)(d), xmm_def);
            save_128_aligned ((__m128i*)(d + 16), xmm_def);
            save_128_aligned ((__m128i*)(d + 32), xmm_def);
            save_128_aligned ((__m128i*)(d + 48), xmm_def);
            save_128_aligned ((__m128i*)(d + 64), xmm_def);
            save_128_aligned ((__m128i*)(d + 80), xmm_def);
            save_128_aligned ((__m128i*)(d + 96), xmm_def);
            save_128_aligned ((__m128i*)(d + 112), xmm_def);

            d += 128;
            w -= 128;
        }

        if (w >= 64)
        {
            save_128_aligned ((__m128i*)(d), xmm_def);
            save_128_aligned ((__m128i*)(d + 16), xmm_def);
            save_128_aligned ((__m128i*)(d + 32), xmm_def);
            save_128_aligned ((__m128i*)(d + 48), xmm_def);

            d += 64;
            w -= 64;
        }

        if (w >= 32)
        {
            save_128_aligned ((__m128i*)(d), xmm_def);
            save_128_aligned ((__m128i*)(d + 16), xmm_def);

            d += 32;
            w -= 32;
        }

        if (w >= 16)
        {
            save_128_aligned ((__m128i*)(d), xmm_def);

            d += 16;
            w -= 16;
        }

        while (w >= 4)
        {
            *(uint32_t *)d = data;

            w -= 4;
            d += 4;
        }

        while (w >= 2)
        {
            *(uint16_t *)d = data;

            w -= 2;
            d += 2;
        }

        while (w >= 1)
        {
            *(uint8_t *)d = data;
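
/* pixman_fill_sse2 replicates the fill value across one 128-bit
 * register, aligns the destination and then stores the value unrolled,
 * up to 128 bytes per iteration.  A minimal usage sketch (the buffer
 * geometry here is illustrative):
 *
 *     uint32_t buf[64 * 64];
 *     pixman_fill_sse2 (buf, 64, 32, 0, 0, 64, 64, 0xff00ff00);
 */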
static void
sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             pixman_image_t *         src_image,
                             pixman_image_t *         mask_image,
                             pixman_image_t *         dst_image,

    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_def;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

        pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
                          PIXMAN_FORMAT_BPP (dst_image->bits.format),
                          dest_x, dest_y, width, height, 0);

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 15)
        {
            uint8_t m = *mask++;

            *dst = pack_1x128_32 (
                pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));

            w--;
            dst++;
        }

        while (w >= 4)
        {
            m = *((uint32_t*)mask);

            if (srca == 0xff && m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_def);
            }
            else if (m)
            {
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                pix_multiply_2x128 (&xmm_src, &xmm_src,
                                    &xmm_mask_lo, &xmm_mask_hi,
                                    &xmm_mask_lo, &xmm_mask_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
            }
            else
            {
                save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
            }

            w -= 4;
            dst += 4;
            mask += 4;
        }

        while (w)
        {
            uint8_t m = *mask++;

            *dst = pack_1x128_32 (
                pix_multiply_1x128 (
                    xmm_src, expand_pixel_8_1x128 (m)));
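
/* For PIXMAN_OP_SRC the destination is replaced outright, so the loops
 * above reduce to dst = src * m: an all-0xff mask word stores the solid
 * color as-is, an all-zero word stores zeros, and anything in between
 * multiplies the expanded mask into the solid source.
 */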
static void
sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              pixman_image_t *         src_image,
                              pixman_image_t *         mask_image,
                              pixman_image_t *         dst_image,

    uint16_t *dst_line, *dst, d;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    mmx_src = xmm_src;
    mmx_alpha = xmm_alpha;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 15)
        {
            mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
            mmx_dest = expand565_16_1x128 (d);

            *dst = pack_565_32_16 (
                pack_1x128_32 (in_over_1x128 (
                    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));

            w--;
            dst++;
        }

        while (w >= 8)
        {
            xmm_dst = load_128_aligned ((__m128i*) dst);
            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            m = *((uint32_t*)mask);
            mask += 4;

            if (m)
            {
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst0, &xmm_dst1);
            }

            m = *((uint32_t*)mask);
            mask += 4;

            if (m)
            {
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);
                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst2, &xmm_dst3);
            }

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 8;
            dst += 8;
        }

        while (w)
        {
            mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
            mmx_dest = expand565_16_1x128 (d);

            *dst = pack_565_32_16 (
                pack_1x128_32 (in_over_1x128 (
                    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
static void
sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,

    uint16_t *dst_line, *dst, d;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;

    uint32_t opaque, zero;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 15)
        {
            s = *src++;
            d = *dst;

            ms = unpack_32_1x128 (s);

            *dst++ = pack_565_32_16 (
                pack_1x128_32 (
                    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));

            w--;
        }

        while (w >= 4)
        {
            xmm_src = load_128_unaligned ((__m128i*)src);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            opaque = is_opaque (xmm_src);
            zero = is_zero (xmm_src);

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

            /* preload next round */
            xmm_src = load_128_unaligned ((__m128i*)(src + 4));

            if (opaque)
            {
                invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
                                     &xmm_dst0, &xmm_dst1);
            }
            else if (!zero)
            {
                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
                                        &xmm_dst0, &xmm_dst1);
            }

            opaque = is_opaque (xmm_src);
            zero = is_zero (xmm_src);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

            if (opaque)
            {
                invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
                                     &xmm_dst2, &xmm_dst3);
            }
            else if (!zero)
            {
                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
                                        &xmm_dst2, &xmm_dst3);
            }

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 4;
            src += 4;
            dst += 4;
        }

        while (w)
        {
            s = *src++;
            d = *dst;

            ms = unpack_32_1x128 (s);

            *dst++ = pack_565_32_16 (
                pack_1x128_32 (
                    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
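
/* The pixbuf sources are non-premultiplied and have R and B swapped
 * relative to the destination, so over_rev_non_pre swaps the color
 * channels and premultiplies by alpha before the usual OVER; the
 * is_opaque/is_zero checks let fully opaque blocks take the cheap
 * invert_colors path and fully transparent blocks be skipped.
 */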
static void
sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,

    uint32_t *dst_line, *dst, d;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;

    uint32_t opaque, zero;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 15)
        {
            s = *src++;
            d = *dst;

            *dst++ = pack_1x128_32 (
                over_rev_non_pre_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (d)));

            w--;
        }

        while (w >= 4)
        {
            xmm_src_hi = load_128_unaligned ((__m128i*)src);

            opaque = is_opaque (xmm_src_hi);
            zero = is_zero (xmm_src_hi);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

            if (opaque)
            {
                invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
                                     &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }
            else if (!zero)
            {
                xmm_dst_hi = load_128_aligned ((__m128i*)dst);

                unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
                                        &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            w -= 4;
            dst += 4;
            src += 4;
        }

        while (w)
        {
            s = *src++;
            d = *dst;

            *dst++ = pack_1x128_32 (
                over_rev_non_pre_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (d)));
static void
sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,

    uint16_t *dst_line, *dst, d;
    uint32_t *mask_line, *mask, m;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    mmx_src = xmm_src;
    mmx_alpha = xmm_alpha;

        mask_line += mask_stride;
        dst_line += dst_stride;

        while (w && ((unsigned long)dst & 15))
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                d = *dst;
                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = expand565_16_1x128 (d);

                *dst = pack_565_32_16 (
                    pack_1x128_32 (in_over_1x128 (
                        &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }

            w--;
            dst++;
            mask++;
        }

        while (w >= 8)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            pack_cmp = _mm_movemask_epi8 (
                _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

            /* preload next round */
            xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
            if (pack_cmp != 0xffff)
            {
                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst0, &xmm_dst1);
            }

            pack_cmp = _mm_movemask_epi8 (
                _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

            if (pack_cmp != 0xffff)
            {
                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst2, &xmm_dst3);
            }

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 8;
            dst += 8;
            mask += 8;
        }

        while (w)
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                d = *dst;
                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = expand565_16_1x128 (d);

                *dst = pack_565_32_16 (
                    pack_1x128_32 (in_over_1x128 (
                        &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }
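
/* pack_cmp is a movemask over a compare-with-zero, so it equals 0xffff
 * exactly when all four component-alpha mask pixels in the chunk are
 * zero; the in_over for that half is then skipped and the destination
 * passes through unchanged.
 */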
static void
sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         pixman_image_t *         src_image,
                         pixman_image_t *         mask_image,
                         pixman_image_t *         dst_image,

    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && ((unsigned long)dst & 15))
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (
                    pix_multiply_1x128 (xmm_alpha,
                                        unpack_32_1x128 (m)),
                    unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 16)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
                                &xmm_mask_lo, &xmm_mask_hi,
                                &xmm_mask_lo, &xmm_mask_hi);

            pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                                &xmm_dst_lo, &xmm_dst_hi,
                                &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            mask += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (
                    pix_multiply_1x128 (
                        xmm_alpha, unpack_32_1x128 (m)),
                    unpack_32_1x128 (d)));
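
/* PIXMAN_OP_IN with a solid source and an a8 mask is a pure multiply
 * chain; per pixel the loops above compute
 *
 *     *dst = ((srca * m) / 255 * *dst) / 255
 *
 * with the rounding divides done inside pix_multiply.
 */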
static void
sse2_composite_in_n_8 (pixman_implementation_t *imp,
                       pixman_op_t              op,
                       pixman_image_t *         src_image,
                       pixman_image_t *         mask_image,
                       pixman_image_t *         dst_image,

    uint8_t *dst_line, *dst;

    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

        pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
                     8, dest_x, dest_y, width, height, src);

        dst_line += dst_stride;

        while (w && ((unsigned long)dst & 15))
        {
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (
                    xmm_alpha,
                    unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 16)
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
                                &xmm_dst_lo, &xmm_dst_hi,
                                &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            dst += 16;
            w -= 16;
        }

        while (w)
        {
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (
                    xmm_alpha,
                    unpack_32_1x128 (d)));
static void
sse2_composite_in_8_8 (pixman_implementation_t *imp,
                       pixman_op_t              op,
                       pixman_image_t *         src_image,
                       pixman_image_t *         mask_image,
                       pixman_image_t *         dst_image,

    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int src_stride, dst_stride;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && ((unsigned long)dst & 15))
        {
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 16)
        {
            xmm_src = load_128_unaligned ((__m128i*)src);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                                &xmm_dst_lo, &xmm_dst_hi,
                                &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            src += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
static void
sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          pixman_image_t *         src_image,
                          pixman_image_t *         mask_image,
                          pixman_image_t *         dst_image,

    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;

    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && ((unsigned long)dst & 15))
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                _mm_adds_epu16 (
                    pix_multiply_1x128 (
                        xmm_alpha, unpack_32_1x128 (m)),
                    unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 16)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
                                &xmm_mask_lo, &xmm_mask_hi,
                                &xmm_mask_lo, &xmm_mask_hi);

            xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
            xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            mask += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                _mm_adds_epu16 (
                    pix_multiply_1x128 (
                        xmm_alpha, unpack_32_1x128 (m)),
                    unpack_32_1x128 (d)));
static void
sse2_composite_add_n_8 (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        pixman_image_t *         src_image,
                        pixman_image_t *         mask_image,
                        pixman_image_t *         dst_image,

    uint8_t *dst_line, *dst;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

        pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
                     8, dest_x, dest_y, width, height, 0xff);

    src = (src << 24) | (src << 16) | (src << 8) | src;
    xmm_src = _mm_set_epi32 (src, src, src, src);

        dst_line += dst_stride;

        while (w && ((unsigned long)dst & 15))
        {
            *dst = (uint8_t)_mm_cvtsi128_si32 (
                _mm_adds_epu8 (
                    xmm_src,
                    _mm_cvtsi32_si128 (*dst)));

            w--;
            dst++;
        }

        while (w >= 16)
        {
            save_128_aligned (
                (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));

            dst += 16;
            w -= 16;
        }

        while (w)
        {
            *dst = (uint8_t)_mm_cvtsi128_si32 (
                _mm_adds_epu8 (
                    xmm_src,
                    _mm_cvtsi32_si128 (*dst)));
static void
sse2_composite_add_8_8 (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        pixman_image_t *         src_image,
                        pixman_image_t *         mask_image,
                        pixman_image_t *         dst_image,

    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (unsigned long)dst & 3)
        {
            t = (*dst) + (*src++);
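            /* t holds a 9-bit sum: (t >> 8) is 1 exactly on overflow,
             * and 0 - 1 wraps to all ones, so the OR below saturates
             * the result to 0xff. */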
            *dst++ = t | (0 - (t >> 8));
        }

        sse2_combine_add_u (imp, op,
                            (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
static void
sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              pixman_image_t *         src_image,
                              pixman_image_t *         mask_image,
                              pixman_image_t *         dst_image,

    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        sse2_combine_add_u (imp, op, dst, src, NULL, width);
static pixman_bool_t
pixman_blt_sse2 (uint32_t *src_bits,

    uint8_t *src_bytes;
    uint8_t *dst_bytes;

    if (src_bpp != dst_bpp)
        return FALSE;

    if (src_bpp == 16)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 2 * width;
    }
    else if (src_bpp == 32)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 4 * width;
    }
    else
    {
        return FALSE;
    }

    while (height--)
    {
        int w;
        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;
        w = byte_width;

        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }

        while (w >= 4 && ((unsigned long)d & 15))
        {
            *(uint32_t *)d = *(uint32_t *)s;
            w -= 4;
            s += 4;
            d += 4;
        }

        while (w >= 64)
        {
            __m128i xmm0, xmm1, xmm2, xmm3;

            xmm0 = load_128_unaligned ((__m128i*)(s));
            xmm1 = load_128_unaligned ((__m128i*)(s + 16));
            xmm2 = load_128_unaligned ((__m128i*)(s + 32));
            xmm3 = load_128_unaligned ((__m128i*)(s + 48));

            save_128_aligned ((__m128i*)(d), xmm0);
            save_128_aligned ((__m128i*)(d + 16), xmm1);
            save_128_aligned ((__m128i*)(d + 32), xmm2);
            save_128_aligned ((__m128i*)(d + 48), xmm3);

            s += 64;
            d += 64;
            w -= 64;
        }

        while (w >= 16)
        {
            save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));

            w -= 16;
            d += 16;
            s += 16;
        }

        while (w >= 4)
        {
            *(uint32_t *)d = *(uint32_t *)s;

            w -= 4;
            s += 4;
            d += 4;
        }

        if (w >= 2)
        {
            *(uint16_t *)d = *(uint16_t *)s;
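
/* pixman_blt_sse2 only handles equal source and destination depths of
 * 16 or 32 bpp; anything else returns FALSE, which lets sse2_blt below
 * defer to the delegate implementation.
 */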
static void
sse2_composite_copy_area (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          pixman_image_t *         src_image,
                          pixman_image_t *         mask_image,
                          pixman_image_t *         dst_image,

    pixman_blt_sse2 (src_image->bits.bits,
                     dst_image->bits.bits,
                     src_image->bits.rowstride,
                     dst_image->bits.rowstride,
                     PIXMAN_FORMAT_BPP (src_image->bits.format),
                     PIXMAN_FORMAT_BPP (dst_image->bits.format),
                     src_x, src_y, dest_x, dest_y, width, height);
static void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,

    uint32_t *src, *src_line, s;
    uint32_t *dst, *dst_line, d;
    uint8_t *mask, *mask_line;

    int src_stride, mask_stride, dst_stride;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        src_line += src_stride;
        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 15)
        {
            s = 0xff000000 | *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            ms = unpack_32_1x128 (s);

            if (m != 0xff)
            {
                __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
                __m128i md = unpack_32_1x128 (d);

                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
            }

            *dst++ = pack_1x128_32 (ms);
            w--;
        }

        while (w >= 4)
        {
            m = *(uint32_t*) mask;
            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);

            if (m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_src);
            }
            else
            {
                xmm_dst = load_128_aligned ((__m128i*)dst);

                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            dst += 4;
            src += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            m = (uint32_t) *mask++;

            if (m)
            {
                s = 0xff000000 | *src;

                if (m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ma, md, ms;

                    d = *dst;

                    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
                    md = unpack_32_1x128 (d);
                    ms = unpack_32_1x128 (s);

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
                }
            }
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,

    uint32_t *src, *src_line, s;
    uint32_t *dst, *dst_line, d;
    uint8_t *mask, *mask_line;

    int src_stride, mask_stride, dst_stride;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        src_line += src_stride;
        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 15)
        {
            uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }

        while (w >= 4)
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                xmm_src = load_128_unaligned ((__m128i*)src);

                if (m == 0xffffffff && is_opaque (xmm_src))
                {
                    save_128_aligned ((__m128i *)dst, xmm_src);
                }
                else
                {
                    xmm_dst = load_128_aligned ((__m128i *)dst);

                    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                                   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
                }
            }

            dst += 4;
            src += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }
static void
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,

    uint32_t *dst_line, *dst;

    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_dsta_hi, xmm_dsta_lo;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);

        dst_line += dst_stride;

        while (w && (unsigned long)dst & 15)
        {
            __m128i vd;

            vd = unpack_32_1x128 (*dst);

            *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                              xmm_src));
            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m128i tmp_lo, tmp_hi;

            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);

            tmp_lo = xmm_src;
            tmp_hi = xmm_src;

            over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                        &xmm_dsta_lo, &xmm_dsta_hi,
                        &tmp_lo, &tmp_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            __m128i vd;

            vd = unpack_32_1x128 (*dst);

            *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                              xmm_src));
static void
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,

    uint32_t *src, *src_line, s;
    uint32_t *dst, *dst_line, d;
    uint32_t *mask, *mask_line;

    int src_stride, mask_stride, dst_stride;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        src_line += src_stride;
        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)dst & 15)
        {
            uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);

            if (!is_transparent (xmm_mask))
            {
                xmm_src = load_128_unaligned ((__m128i*)src);

                if (is_opaque (xmm_mask) && is_opaque (xmm_src))
                {
                    save_128_aligned ((__m128i *)dst, xmm_src);
                }
                else
                {
                    xmm_dst = load_128_aligned ((__m128i *)dst);

                    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                                   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
                }
            }

            dst += 4;
            src += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }
/* A variant of 'sse2_combine_over_u' with minor tweaks */
static force_inline void
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                             const uint32_t* ps,
                                             int32_t         w,
                                             pixman_fixed_t  vx,
                                             pixman_fixed_t  unit_x,
                                             pixman_fixed_t  max_vx,
                                             pixman_bool_t   fully_transparent_src)
{
    uint32_t s, d;
    const uint32_t* pm = NULL;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (fully_transparent_src)
        return;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps + (vx >> 16), pm);
        vx += unit_x;
        w--;

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
    }

    while (w >= 4)
    {
        __m128i tmp;
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = ps[vx >> 16];
        vx += unit_x;
        tmp2 = ps[vx >> 16];
        vx += unit_x;
        tmp3 = ps[vx >> 16];
        vx += unit_x;
        tmp4 = ps[vx >> 16];
        vx += unit_x;

        tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

        xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);

        if (is_opaque (xmm_src_hi))
        {
            save_128_aligned ((__m128i*)pd, xmm_src_hi);
        }
        else if (!is_zero (xmm_src_hi))
        {
            xmm_dst_hi = load_128_aligned ((__m128i*) pd);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

            expand_alpha_2x128 (
                xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned ((__m128i*)pd,
                              pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        w -= 4;
        pd += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps + (vx >> 16), pm);
        vx += unit_x;
        w--;

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
    }
}

FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)
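
/* In these nearest-neighbour scanline routines vx is a 16.16
 * fixed-point source coordinate: each destination pixel fetches the
 * source at vx >> 16 (pixman_fixed_to_int) and then steps vx by
 * unit_x, the fixed-point width of one destination pixel in source
 * space.
 */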
static force_inline void
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
                                               uint32_t *       dst,
                                               const uint32_t * src,
                                               int32_t          w,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
                                               pixman_fixed_t   max_vx,
                                               pixman_bool_t    zero_src)
{
    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (zero_src || (*mask >> 24) == 0)
        return;

    xmm_mask = create_mask_16_128 (*mask >> 24);

    while (w && (unsigned long)dst & 15)
    {
        uint32_t s = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        if (s)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (s);
            __m128i alpha = expand_alpha_1x128 (ms);
            __m128i dest = xmm_mask;
            __m128i alpha_dst = unpack_32_1x128 (d);

            *dst = pack_1x128_32 (
                in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
        }

        dst++;
        w--;
    }

    while (w >= 4)
    {
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp2 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp3 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;
        tmp4 = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

        if (!is_zero (xmm_src))
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                           &xmm_alpha_lo, &xmm_alpha_hi,
                           &xmm_mask, &xmm_mask,
                           &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        dst += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = src[pixman_fixed_to_int (vx)];
        vx += unit_x;

        if (s)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (s);
            __m128i alpha = expand_alpha_1x128 (ms);
            __m128i mask = xmm_mask;
            __m128i dest = unpack_32_1x128 (d);

            *dst = pack_1x128_32 (
                in_over_1x128 (&ms, &alpha, &mask, &dest));
        }

        dst++;
        w--;
    }
}

FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
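
/* Each row of the table below maps an (operator, source format, mask
 * format, destination format) combination to one of the compositors
 * above; pixman scans the table in order and uses the first row whose
 * formats and flags match, falling back to the delegate implementation
 * when none does.
 */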
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),

    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),

    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),

    { PIXMAN_OP_NONE },
};
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,

    if (!pixman_blt_sse2 (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height))
    {
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height);
    }

    return TRUE;
}

#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,

    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((unsigned long)dst) & 0x0f)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    while (w >= 4)
    {
        save_128_aligned (
            (__m128i *)dst, _mm_or_si128 (
                load_128_unaligned ((__m128i *)src), ff000000));

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    return iter->buffer;
}

static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;

    iter->bits += iter->stride;

    while (w && ((unsigned long)dst) & 0x0f)
    {
        uint16_t s = *src++;

        *dst++ = CONVERT_0565_TO_8888 (s);
        w--;
    }

    while (w >= 8)
    {
        __m128i lo, hi, s;

        s = _mm_loadu_si128 ((__m128i *)src);

        lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
        hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));

        save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
        save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        uint16_t s = *src++;

        *dst++ = CONVERT_0565_TO_8888 (s);
        w--;
    }

    return iter->buffer;
}

static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

    iter->bits += iter->stride;

    while (w && (((unsigned long)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    while (w >= 16)
    {
        xmm0 = _mm_loadu_si128 ((__m128i *)src);

        xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128 (), xmm0);
        xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128 (), xmm0);
        xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm1);
        xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm1);
        xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm2);
        xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm2);

        _mm_store_si128 ((__m128i *)(dst + 0), xmm3);
        _mm_store_si128 ((__m128i *)(dst + 4), xmm4);
        _mm_store_si128 ((__m128i *)(dst + 8), xmm5);
        _mm_store_si128 ((__m128i *)(dst + 12), xmm6);

        dst += 16;
        src += 16;
        w -= 16;
    }

    while (w)
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    return iter->buffer;
}
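
/* Interleaving a byte with zeros three times moves it into the top
 * byte of its 32-bit lane, so each a8 value ends up as a << 24, an
 * alpha-only a8r8g8b8 pixel, matching the scalar loops above.
 */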
typedef struct
{
    pixman_format_code_t format;
    pixman_iter_get_scanline_t get_scanline;
} fetcher_info_t;

static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
    { PIXMAN_a8, sse2_fetch_a8 },
    { PIXMAN_null },
};

static void
sse2_src_iter_init (pixman_implementation_t *imp,
                    pixman_iter_t *iter,
                    pixman_image_t *image,
                    int x, int y, int width, int height,
                    uint8_t *buffer, iter_flags_t flags)
{
#define FLAGS \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)

    if ((flags & ITER_NARROW) &&
        (image->common.flags & FLAGS) == FLAGS &&
        x >= 0 && y >= 0 &&
        x + width <= image->bits.width &&
        y + height <= image->bits.height)
    {
        const fetcher_info_t *f;

        for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
        {
            if (image->common.extended_format_code == f->format)
            {
                uint8_t *b = (uint8_t *)image->bits.bits;
                int s = image->bits.rowstride * 4;

                iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
                iter->stride = s;
                iter->width = width;
                iter->buffer = (uint32_t *)buffer;

                iter->get_scanline = f->get_scanline;
                return;
            }
        }
    }

    _pixman_implementation_src_iter_init (
        imp->delegate, iter, image, x, y, width, height, buffer, flags);
}
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);

    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->src_iter_init = sse2_src_iter_init;

    return imp;
}