/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
33 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
34 #include <emmintrin.h> /* for SSE2 intrinsics */
35 #include "pixman-private.h"
36 #include "pixman-combine32.h"
37 #include "pixman-inlines.h"
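/* 128-bit constants used throughout this file.  They are initialized
 * once, when the SSE2 implementation is created, before any of the
 * routines below can run.
 */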
39 static __m128i mask_0080;
40 static __m128i mask_00ff;
41 static __m128i mask_0101;
42 static __m128i mask_ffff;
43 static __m128i mask_ff000000;
44 static __m128i mask_alpha;
46 static __m128i mask_565_r;
47 static __m128i mask_565_g1, mask_565_g2;
48 static __m128i mask_565_b;
49 static __m128i mask_red;
50 static __m128i mask_green;
51 static __m128i mask_blue;
53 static __m128i mask_565_fix_rb;
54 static __m128i mask_565_fix_g;
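/* All arithmetic below operates on pixels unpacked to 16 bits per
 * channel, which leaves enough headroom for an 8-bit multiply plus
 * the rounding bias without overflow.
 */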
56 static force_inline __m128i
57 unpack_32_1x128 (uint32_t data)
59 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
62 static force_inline void
63 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
65 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
66 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
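/* Expand r5g6b5 to 8888: shift each field into its byte, then
 * replicate the field's top bits into the vacated low bits so that
 * full intensity maps to 0xff (a plain shift would top out at
 * 0xf8/0xfc).
 */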
69 static force_inline __m128i
70 unpack_565_to_8888 (__m128i lo)
72 __m128i r, g, b, rb, t;
74 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
75 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
76 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
78 rb = _mm_or_si128 (r, b);
79 t = _mm_and_si128 (rb, mask_565_fix_rb);
80 t = _mm_srli_epi32 (t, 5);
81 rb = _mm_or_si128 (rb, t);
83 t = _mm_and_si128 (g, mask_565_fix_g);
84 t = _mm_srli_epi32 (t, 6);
85 g = _mm_or_si128 (g, t);
87 return _mm_or_si128 (rb, g);
90 static force_inline void
91 unpack_565_128_4x128 (__m128i data,
99 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
100 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
102 lo = unpack_565_to_8888 (lo);
103 hi = unpack_565_to_8888 (hi);
105 unpack_128_2x128 (lo, data0, data1);
106 unpack_128_2x128 (hi, data2, data3);
109 static force_inline uint16_t
110 pack_565_32_16 (uint32_t pixel)
112 return (uint16_t) (((pixel >> 8) & 0xf800) |
113 ((pixel >> 5) & 0x07e0) |
114 ((pixel >> 3) & 0x001f));
117 static force_inline __m128i
118 pack_2x128_128 (__m128i lo, __m128i hi)
120 return _mm_packus_epi16 (lo, hi);
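/* Pack 8888 back to r5g6b5: keep the top 5/6/5 bits of each channel
 * and assemble the two bytes of each 565 pixel in separate 16-bit
 * lanes, so that the final packus step in pack_565_4x128_128 merges
 * them into place.
 */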
123 static force_inline __m128i
124 pack_565_2x128_128 (__m128i lo, __m128i hi)
127 __m128i r, g1, g2, b;
129 data = pack_2x128_128 (lo, hi);
131 r = _mm_and_si128 (data, mask_565_r);
132 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
133 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
134 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
136 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
139 static force_inline __m128i
140 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
142 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
143 pack_565_2x128_128 (*xmm2, *xmm3));
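/* _mm_movemask_epi8 collects the top bit of each of the 16 bytes, so
 * the alpha bytes of the four pixels produce bits 3, 7, 11 and 15,
 * hence the 0x8888 masks.  is_opaque tests that all four alphas are
 * 0xff, is_zero that the whole vector is zero, and is_transparent
 * that all four alphas are zero.
 */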
146 static force_inline int
147 is_opaque (__m128i x)
149 __m128i ffs = _mm_cmpeq_epi8 (x, x);
151 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
static force_inline int
is_zero (__m128i x)
    return _mm_movemask_epi8 (
	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
161 static force_inline int
162 is_transparent (__m128i x)
164 return (_mm_movemask_epi8 (
165 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
168 static force_inline __m128i
169 expand_pixel_32_1x128 (uint32_t data)
171 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
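/* After unpacking, each pixel occupies four 16-bit lanes in b, g, r,
 * a order, so lane 3 holds alpha.  These shuffles broadcast the
 * alpha lane into all four lanes of its pixel.
 */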
174 static force_inline __m128i
175 expand_alpha_1x128 (__m128i data)
177 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
178 _MM_SHUFFLE (3, 3, 3, 3)),
179 _MM_SHUFFLE (3, 3, 3, 3));
182 static force_inline void
183 expand_alpha_2x128 (__m128i data_lo,
190 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
191 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
193 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
194 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
197 static force_inline void
198 expand_alpha_rev_2x128 (__m128i data_lo,
205 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
206 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
207 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
208 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
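/* Per-channel multiply with exact rounding: each 16-bit lane becomes
 * ((x * a + 0x80) * 0x101) >> 16, which is the correctly rounded
 * value of x * a / 255.
 */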
211 static force_inline void
212 pix_multiply_2x128 (__m128i* data_lo,
221 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
222 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
223 lo = _mm_adds_epu16 (lo, mask_0080);
224 hi = _mm_adds_epu16 (hi, mask_0080);
225 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
226 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
229 static force_inline void
230 pix_add_multiply_2x128 (__m128i* src_lo,
232 __m128i* alpha_dst_lo,
233 __m128i* alpha_dst_hi,
236 __m128i* alpha_src_lo,
237 __m128i* alpha_src_hi,
241 __m128i t1_lo, t1_hi;
242 __m128i t2_lo, t2_hi;
244 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
245 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
247 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
248 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
251 static force_inline void
252 negate_2x128 (__m128i data_lo,
257 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
258 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
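/* Swap the red and blue lanes of each pixel; used by the
 * over_rev_non_pre helpers for non-premultiplied, reverse-order
 * source formats.
 */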
261 static force_inline void
262 invert_colors_2x128 (__m128i data_lo,
269 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
270 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
271 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
272 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
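/* Porter-Duff OVER for premultiplied pixels:
 * dst = src + dst * (255 - alpha) / 255, per channel, with a
 * saturating add.
 */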
275 static force_inline void
276 over_2x128 (__m128i* src_lo,
285 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
287 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
289 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
290 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
293 static force_inline void
294 over_rev_non_pre_2x128 (__m128i src_lo,
300 __m128i alpha_lo, alpha_hi;
302 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
304 lo = _mm_or_si128 (alpha_lo, mask_alpha);
305 hi = _mm_or_si128 (alpha_hi, mask_alpha);
307 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
309 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
311 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
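/* in_over = (src IN mask) OVER dst: both the source and its alpha
 * are first scaled by the mask, channel by channel.
 */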
314 static force_inline void
315 in_over_2x128 (__m128i* src_lo,
327 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
328 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
330 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
333 /* load 4 pixels from a 16-byte boundary aligned address */
334 static force_inline __m128i
335 load_128_aligned (__m128i* src)
337 return _mm_load_si128 (src);
/* load 4 pixels from an unaligned address */
341 static force_inline __m128i
342 load_128_unaligned (const __m128i* src)
344 return _mm_loadu_si128 (src);
/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
350 static force_inline void
351 save_128_write_combining (__m128i* dst,
354 _mm_stream_si128 (dst, data);
357 /* save 4 pixels on a 16-byte boundary aligned address */
358 static force_inline void
359 save_128_aligned (__m128i* dst,
362 _mm_store_si128 (dst, data);
/* save 4 pixels on an unaligned address */
366 static force_inline void
367 save_128_unaligned (__m128i* dst,
370 _mm_storeu_si128 (dst, data);
373 static force_inline __m128i
374 load_32_1x128 (uint32_t data)
376 return _mm_cvtsi32_si128 (data);
379 static force_inline __m128i
380 expand_alpha_rev_1x128 (__m128i data)
382 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
385 static force_inline __m128i
386 expand_pixel_8_1x128 (uint8_t data)
388 return _mm_shufflelo_epi16 (
389 unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
392 static force_inline __m128i
393 pix_multiply_1x128 (__m128i data,
396 return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
401 static force_inline __m128i
402 pix_add_multiply_1x128 (__m128i* src,
407 __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
408 __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
410 return _mm_adds_epu8 (t1, t2);
413 static force_inline __m128i
414 negate_1x128 (__m128i data)
416 return _mm_xor_si128 (data, mask_00ff);
419 static force_inline __m128i
420 invert_colors_1x128 (__m128i data)
422 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
425 static force_inline __m128i
426 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
428 return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
431 static force_inline __m128i
432 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
434 return over_1x128 (pix_multiply_1x128 (*src, *mask),
435 pix_multiply_1x128 (*alpha, *mask),
439 static force_inline __m128i
440 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
442 __m128i alpha = expand_alpha_1x128 (src);
444 return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
445 _mm_or_si128 (alpha, mask_alpha)),
450 static force_inline uint32_t
451 pack_1x128_32 (__m128i data)
453 return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
456 static force_inline __m128i
457 expand565_16_1x128 (uint16_t pixel)
459 __m128i m = _mm_cvtsi32_si128 (pixel);
461 m = unpack_565_to_8888 (m);
463 return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
466 static force_inline uint32_t
467 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
480 xmms = unpack_32_1x128 (src);
481 return pack_1x128_32 (
482 over_1x128 (xmms, expand_alpha_1x128 (xmms),
483 unpack_32_1x128 (dst)));
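/* combine1/combine4 fetch 1 or 4 source pixels and, when a mask is
 * present (pm != NULL), multiply them by the mask's expanded alpha
 * first; without a mask the source is returned unchanged.
 */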
489 static force_inline uint32_t
490 combine1 (const uint32_t *ps, const uint32_t *pm)
498 mm = unpack_32_1x128 (*pm);
499 mm = expand_alpha_1x128 (mm);
501 ms = unpack_32_1x128 (s);
502 ms = pix_multiply_1x128 (ms, mm);
504 s = pack_1x128_32 (ms);
510 static force_inline __m128i
511 combine4 (const __m128i *ps, const __m128i *pm)
513 __m128i xmm_src_lo, xmm_src_hi;
514 __m128i xmm_msk_lo, xmm_msk_hi;
519 xmm_msk_lo = load_128_unaligned (pm);
521 if (is_transparent (xmm_msk_lo))
522 return _mm_setzero_si128 ();
525 s = load_128_unaligned (ps);
529 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
530 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
532 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
534 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
535 &xmm_msk_lo, &xmm_msk_hi,
536 &xmm_src_lo, &xmm_src_hi);
538 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
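/* The combiner loops below all share one shape: process pixels one
 * at a time until dst reaches a 16-byte boundary, then four at a
 * time with aligned stores, then finish the tail pixel by pixel.
 * The OVER loops additionally store fully opaque vectors directly
 * and skip the blend for fully transparent ones.
 */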
544 static force_inline void
545 core_combine_over_u_sse2_mask (uint32_t * pd,
552 /* Align dst on a 16-byte boundary */
553 while (w && ((unsigned long)pd & 15))
556 s = combine1 (ps, pm);
559 *pd = core_combine_over_u_pixel_sse2 (s, d);
568 __m128i mask = load_128_unaligned ((__m128i *)pm);
573 __m128i src_hi, src_lo;
574 __m128i mask_hi, mask_lo;
575 __m128i alpha_hi, alpha_lo;
577 src = load_128_unaligned ((__m128i *)ps);
579 if (is_opaque (_mm_and_si128 (src, mask)))
581 save_128_aligned ((__m128i *)pd, src);
585 __m128i dst = load_128_aligned ((__m128i *)pd);
586 __m128i dst_hi, dst_lo;
588 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
589 unpack_128_2x128 (src, &src_lo, &src_hi);
591 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
592 pix_multiply_2x128 (&src_lo, &src_hi,
596 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
598 expand_alpha_2x128 (src_lo, src_hi,
599 &alpha_lo, &alpha_hi);
601 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
606 pack_2x128_128 (dst_lo, dst_hi));
618 s = combine1 (ps, pm);
621 *pd = core_combine_over_u_pixel_sse2 (s, d);
630 static force_inline void
631 core_combine_over_u_sse2_no_mask (uint32_t * pd,
637 /* Align dst on a 16-byte boundary */
638 while (w && ((unsigned long)pd & 15))
644 *pd = core_combine_over_u_pixel_sse2 (s, d);
653 __m128i src_hi, src_lo, dst_hi, dst_lo;
654 __m128i alpha_hi, alpha_lo;
656 src = load_128_unaligned ((__m128i *)ps);
662 save_128_aligned ((__m128i *)pd, src);
666 __m128i dst = load_128_aligned ((__m128i *)pd);
668 unpack_128_2x128 (src, &src_lo, &src_hi);
669 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
671 expand_alpha_2x128 (src_lo, src_hi,
672 &alpha_lo, &alpha_hi);
673 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
678 pack_2x128_128 (dst_lo, dst_hi));
692 *pd = core_combine_over_u_pixel_sse2 (s, d);
700 static force_inline void
701 sse2_combine_over_u (pixman_implementation_t *imp,
709 core_combine_over_u_sse2_mask (pd, ps, pm, w);
711 core_combine_over_u_sse2_no_mask (pd, ps, w);
715 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
724 __m128i xmm_dst_lo, xmm_dst_hi;
725 __m128i xmm_src_lo, xmm_src_hi;
726 __m128i xmm_alpha_lo, xmm_alpha_hi;
728 /* Align dst on a 16-byte boundary */
730 ((unsigned long)pd & 15))
733 s = combine1 (ps, pm);
735 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
/* I'm loading unaligned because I'm not sure
 * about the address alignment.
 */
747 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
748 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
750 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
751 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
753 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
754 &xmm_alpha_lo, &xmm_alpha_hi);
756 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
757 &xmm_alpha_lo, &xmm_alpha_hi,
758 &xmm_src_lo, &xmm_src_hi);
/* rebuild the 4 pixel data and save */
761 save_128_aligned ((__m128i*)pd,
762 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
775 s = combine1 (ps, pm);
777 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
785 static force_inline uint32_t
786 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
788 uint32_t maska = src >> 24;
794 else if (maska != 0xff)
796 return pack_1x128_32 (
797 pix_multiply_1x128 (unpack_32_1x128 (dst),
798 expand_alpha_1x128 (unpack_32_1x128 (src))));
805 sse2_combine_in_u (pixman_implementation_t *imp,
814 __m128i xmm_src_lo, xmm_src_hi;
815 __m128i xmm_dst_lo, xmm_dst_hi;
817 while (w && ((unsigned long) pd & 15))
819 s = combine1 (ps, pm);
822 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
831 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
832 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
834 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
835 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
837 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
838 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
839 &xmm_dst_lo, &xmm_dst_hi,
840 &xmm_dst_lo, &xmm_dst_hi);
842 save_128_aligned ((__m128i*)pd,
843 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
854 s = combine1 (ps, pm);
857 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
866 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
875 __m128i xmm_src_lo, xmm_src_hi;
876 __m128i xmm_dst_lo, xmm_dst_hi;
878 while (w && ((unsigned long) pd & 15))
880 s = combine1 (ps, pm);
883 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
892 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
893 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
895 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
896 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
898 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
899 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
900 &xmm_src_lo, &xmm_src_hi,
901 &xmm_dst_lo, &xmm_dst_hi);
904 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
915 s = combine1 (ps, pm);
918 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
927 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
934 while (w && ((unsigned long) pd & 15))
936 uint32_t s = combine1 (ps, pm);
939 *pd++ = pack_1x128_32 (
941 unpack_32_1x128 (d), negate_1x128 (
942 expand_alpha_1x128 (unpack_32_1x128 (s)))));
952 __m128i xmm_src_lo, xmm_src_hi;
953 __m128i xmm_dst_lo, xmm_dst_hi;
955 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
956 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
958 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
959 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
961 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
962 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
964 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
965 &xmm_src_lo, &xmm_src_hi,
966 &xmm_dst_lo, &xmm_dst_hi);
969 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
981 uint32_t s = combine1 (ps, pm);
984 *pd++ = pack_1x128_32 (
986 unpack_32_1x128 (d), negate_1x128 (
987 expand_alpha_1x128 (unpack_32_1x128 (s)))));
996 sse2_combine_out_u (pixman_implementation_t *imp,
1000 const uint32_t * pm,
1003 while (w && ((unsigned long) pd & 15))
1005 uint32_t s = combine1 (ps, pm);
1008 *pd++ = pack_1x128_32 (
1009 pix_multiply_1x128 (
1010 unpack_32_1x128 (s), negate_1x128 (
1011 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1020 __m128i xmm_src_lo, xmm_src_hi;
1021 __m128i xmm_dst_lo, xmm_dst_hi;
1023 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1024 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1026 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1027 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1029 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1030 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1032 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1033 &xmm_dst_lo, &xmm_dst_hi,
1034 &xmm_dst_lo, &xmm_dst_hi);
1037 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1048 uint32_t s = combine1 (ps, pm);
1051 *pd++ = pack_1x128_32 (
1052 pix_multiply_1x128 (
1053 unpack_32_1x128 (s), negate_1x128 (
1054 expand_alpha_1x128 (unpack_32_1x128 (d)))));
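/* ATOP: result = (src * dst.alpha + dst * (255 - src.alpha)) / 255;
 * the reverse and XOR variants below only change which of the two
 * alphas get negated.
 */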
1062 static force_inline uint32_t
1063 core_combine_atop_u_pixel_sse2 (uint32_t src,
1066 __m128i s = unpack_32_1x128 (src);
1067 __m128i d = unpack_32_1x128 (dst);
1069 __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1070 __m128i da = expand_alpha_1x128 (d);
1072 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1076 sse2_combine_atop_u (pixman_implementation_t *imp,
1079 const uint32_t * ps,
1080 const uint32_t * pm,
1085 __m128i xmm_src_lo, xmm_src_hi;
1086 __m128i xmm_dst_lo, xmm_dst_hi;
1087 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1088 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1090 while (w && ((unsigned long) pd & 15))
1092 s = combine1 (ps, pm);
1095 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1104 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1105 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1107 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1108 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1110 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1111 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1112 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1113 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1115 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1116 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1118 pix_add_multiply_2x128 (
1119 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1120 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1121 &xmm_dst_lo, &xmm_dst_hi);
1124 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1135 s = combine1 (ps, pm);
1138 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1146 static force_inline uint32_t
1147 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1150 __m128i s = unpack_32_1x128 (src);
1151 __m128i d = unpack_32_1x128 (dst);
1153 __m128i sa = expand_alpha_1x128 (s);
1154 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1156 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1160 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1163 const uint32_t * ps,
1164 const uint32_t * pm,
1169 __m128i xmm_src_lo, xmm_src_hi;
1170 __m128i xmm_dst_lo, xmm_dst_hi;
1171 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1172 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1174 while (w && ((unsigned long) pd & 15))
1176 s = combine1 (ps, pm);
1179 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1188 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1189 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1191 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1192 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1194 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1195 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1196 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1197 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1199 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1200 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1202 pix_add_multiply_2x128 (
1203 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1204 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1205 &xmm_dst_lo, &xmm_dst_hi);
1208 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1219 s = combine1 (ps, pm);
1222 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1230 static force_inline uint32_t
1231 core_combine_xor_u_pixel_sse2 (uint32_t src,
1234 __m128i s = unpack_32_1x128 (src);
1235 __m128i d = unpack_32_1x128 (dst);
1237 __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1238 __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1240 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1244 sse2_combine_xor_u (pixman_implementation_t *imp,
1247 const uint32_t * src,
1248 const uint32_t * mask,
1254 const uint32_t* ps = src;
1255 const uint32_t* pm = mask;
1257 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1258 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1259 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1260 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1262 while (w && ((unsigned long) pd & 15))
1264 s = combine1 (ps, pm);
1267 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1276 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1277 xmm_dst = load_128_aligned ((__m128i*) pd);
1279 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1280 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1282 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1283 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1284 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1285 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1287 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1288 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1289 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1290 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1292 pix_add_multiply_2x128 (
1293 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1294 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1295 &xmm_dst_lo, &xmm_dst_hi);
1298 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1309 s = combine1 (ps, pm);
1312 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1320 static force_inline void
1321 sse2_combine_add_u (pixman_implementation_t *imp,
1324 const uint32_t * src,
1325 const uint32_t * mask,
1331 const uint32_t* ps = src;
1332 const uint32_t* pm = mask;
1334 while (w && (unsigned long)pd & 15)
1336 s = combine1 (ps, pm);
1342 *pd++ = _mm_cvtsi128_si32 (
1343 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1351 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1354 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1365 s = combine1 (ps, pm);
1369 *pd++ = _mm_cvtsi128_si32 (
1370 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
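/* SATURATE: when a source alpha exceeds the destination's remaining
 * headroom (~dst.alpha), the source is first scaled by
 * DIV_UN8 (~dst.alpha, src.alpha), i.e. (255 - dst.a) / src.a as an
 * 8-bit fraction, so that the following add cannot overflow a
 * channel.
 */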
1376 static force_inline uint32_t
1377 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1380 __m128i ms = unpack_32_1x128 (src);
1381 __m128i md = unpack_32_1x128 (dst);
1382 uint32_t sa = src >> 24;
1383 uint32_t da = ~dst >> 24;
1387 ms = pix_multiply_1x128 (
1388 ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1391 return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1395 sse2_combine_saturate_u (pixman_implementation_t *imp,
1398 const uint32_t * ps,
1399 const uint32_t * pm,
1405 __m128i xmm_src, xmm_dst;
1407 while (w && (unsigned long)pd & 15)
1409 s = combine1 (ps, pm);
1412 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1421 xmm_dst = load_128_aligned ((__m128i*)pd);
1422 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1424 pack_cmp = _mm_movemask_epi8 (
1426 _mm_srli_epi32 (xmm_src, 24),
1427 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
/* if some source alpha is greater than the respective ~dest alpha */
1432 s = combine1 (ps++, pm);
1434 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1438 s = combine1 (ps++, pm);
1440 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1444 s = combine1 (ps++, pm);
1446 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1450 s = combine1 (ps++, pm);
1452 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1458 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1471 s = combine1 (ps, pm);
1474 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1482 sse2_combine_src_ca (pixman_implementation_t *imp,
1485 const uint32_t * ps,
1486 const uint32_t * pm,
1491 __m128i xmm_src_lo, xmm_src_hi;
1492 __m128i xmm_mask_lo, xmm_mask_hi;
1493 __m128i xmm_dst_lo, xmm_dst_hi;
1495 while (w && (unsigned long)pd & 15)
1499 *pd++ = pack_1x128_32 (
1500 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1506 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1507 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1509 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1510 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1512 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1513 &xmm_mask_lo, &xmm_mask_hi,
1514 &xmm_dst_lo, &xmm_dst_hi);
1517 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1529 *pd++ = pack_1x128_32 (
1530 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1535 static force_inline uint32_t
1536 core_combine_over_ca_pixel_sse2 (uint32_t src,
1540 __m128i s = unpack_32_1x128 (src);
1541 __m128i expAlpha = expand_alpha_1x128 (s);
1542 __m128i unpk_mask = unpack_32_1x128 (mask);
1543 __m128i unpk_dst = unpack_32_1x128 (dst);
1545 return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1549 sse2_combine_over_ca (pixman_implementation_t *imp,
1552 const uint32_t * ps,
1553 const uint32_t * pm,
1558 __m128i xmm_alpha_lo, xmm_alpha_hi;
1559 __m128i xmm_src_lo, xmm_src_hi;
1560 __m128i xmm_dst_lo, xmm_dst_hi;
1561 __m128i xmm_mask_lo, xmm_mask_hi;
1563 while (w && (unsigned long)pd & 15)
1569 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1575 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1576 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1577 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1579 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1580 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1581 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1583 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1584 &xmm_alpha_lo, &xmm_alpha_hi);
1586 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1587 &xmm_alpha_lo, &xmm_alpha_hi,
1588 &xmm_mask_lo, &xmm_mask_hi,
1589 &xmm_dst_lo, &xmm_dst_hi);
1592 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1606 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1611 static force_inline uint32_t
1612 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1616 __m128i d = unpack_32_1x128 (dst);
1618 return pack_1x128_32 (
1619 over_1x128 (d, expand_alpha_1x128 (d),
1620 pix_multiply_1x128 (unpack_32_1x128 (src),
1621 unpack_32_1x128 (mask))));
1625 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1628 const uint32_t * ps,
1629 const uint32_t * pm,
1634 __m128i xmm_alpha_lo, xmm_alpha_hi;
1635 __m128i xmm_src_lo, xmm_src_hi;
1636 __m128i xmm_dst_lo, xmm_dst_hi;
1637 __m128i xmm_mask_lo, xmm_mask_hi;
1639 while (w && (unsigned long)pd & 15)
1645 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1651 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1652 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1653 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1655 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1656 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1657 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1659 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1660 &xmm_alpha_lo, &xmm_alpha_hi);
1661 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1662 &xmm_mask_lo, &xmm_mask_hi,
1663 &xmm_mask_lo, &xmm_mask_hi);
1665 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1666 &xmm_alpha_lo, &xmm_alpha_hi,
1667 &xmm_mask_lo, &xmm_mask_hi);
1670 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1684 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1690 sse2_combine_in_ca (pixman_implementation_t *imp,
1693 const uint32_t * ps,
1694 const uint32_t * pm,
1699 __m128i xmm_alpha_lo, xmm_alpha_hi;
1700 __m128i xmm_src_lo, xmm_src_hi;
1701 __m128i xmm_dst_lo, xmm_dst_hi;
1702 __m128i xmm_mask_lo, xmm_mask_hi;
1704 while (w && (unsigned long)pd & 15)
1710 *pd++ = pack_1x128_32 (
1711 pix_multiply_1x128 (
1712 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1713 expand_alpha_1x128 (unpack_32_1x128 (d))));
1720 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1721 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1722 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1724 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1725 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1726 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1728 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1729 &xmm_alpha_lo, &xmm_alpha_hi);
1731 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1732 &xmm_mask_lo, &xmm_mask_hi,
1733 &xmm_dst_lo, &xmm_dst_hi);
1735 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1736 &xmm_alpha_lo, &xmm_alpha_hi,
1737 &xmm_dst_lo, &xmm_dst_hi);
1740 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1754 *pd++ = pack_1x128_32 (
1755 pix_multiply_1x128 (
1756 pix_multiply_1x128 (
1757 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1758 expand_alpha_1x128 (unpack_32_1x128 (d))));
1765 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1768 const uint32_t * ps,
1769 const uint32_t * pm,
1774 __m128i xmm_alpha_lo, xmm_alpha_hi;
1775 __m128i xmm_src_lo, xmm_src_hi;
1776 __m128i xmm_dst_lo, xmm_dst_hi;
1777 __m128i xmm_mask_lo, xmm_mask_hi;
1779 while (w && (unsigned long)pd & 15)
1785 *pd++ = pack_1x128_32 (
1786 pix_multiply_1x128 (
1787 unpack_32_1x128 (d),
1788 pix_multiply_1x128 (unpack_32_1x128 (m),
1789 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1795 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1796 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1797 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1799 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1800 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1801 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1803 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1804 &xmm_alpha_lo, &xmm_alpha_hi);
1805 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1806 &xmm_alpha_lo, &xmm_alpha_hi,
1807 &xmm_alpha_lo, &xmm_alpha_hi);
1809 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1810 &xmm_alpha_lo, &xmm_alpha_hi,
1811 &xmm_dst_lo, &xmm_dst_hi);
1814 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1828 *pd++ = pack_1x128_32 (
1829 pix_multiply_1x128 (
1830 unpack_32_1x128 (d),
1831 pix_multiply_1x128 (unpack_32_1x128 (m),
1832 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1838 sse2_combine_out_ca (pixman_implementation_t *imp,
1841 const uint32_t * ps,
1842 const uint32_t * pm,
1847 __m128i xmm_alpha_lo, xmm_alpha_hi;
1848 __m128i xmm_src_lo, xmm_src_hi;
1849 __m128i xmm_dst_lo, xmm_dst_hi;
1850 __m128i xmm_mask_lo, xmm_mask_hi;
1852 while (w && (unsigned long)pd & 15)
1858 *pd++ = pack_1x128_32 (
1859 pix_multiply_1x128 (
1860 pix_multiply_1x128 (
1861 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1862 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1868 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1869 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1870 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1872 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1873 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1874 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1876 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1877 &xmm_alpha_lo, &xmm_alpha_hi);
1878 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1879 &xmm_alpha_lo, &xmm_alpha_hi);
1881 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1882 &xmm_mask_lo, &xmm_mask_hi,
1883 &xmm_dst_lo, &xmm_dst_hi);
1884 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1885 &xmm_alpha_lo, &xmm_alpha_hi,
1886 &xmm_dst_lo, &xmm_dst_hi);
1889 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1903 *pd++ = pack_1x128_32 (
1904 pix_multiply_1x128 (
1905 pix_multiply_1x128 (
1906 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1907 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1914 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1917 const uint32_t * ps,
1918 const uint32_t * pm,
1923 __m128i xmm_alpha_lo, xmm_alpha_hi;
1924 __m128i xmm_src_lo, xmm_src_hi;
1925 __m128i xmm_dst_lo, xmm_dst_hi;
1926 __m128i xmm_mask_lo, xmm_mask_hi;
1928 while (w && (unsigned long)pd & 15)
1934 *pd++ = pack_1x128_32 (
1935 pix_multiply_1x128 (
1936 unpack_32_1x128 (d),
1937 negate_1x128 (pix_multiply_1x128 (
1938 unpack_32_1x128 (m),
1939 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1945 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1946 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1947 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1949 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1950 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1951 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1953 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1954 &xmm_alpha_lo, &xmm_alpha_hi);
1956 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1957 &xmm_alpha_lo, &xmm_alpha_hi,
1958 &xmm_mask_lo, &xmm_mask_hi);
1960 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1961 &xmm_mask_lo, &xmm_mask_hi);
1963 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1964 &xmm_mask_lo, &xmm_mask_hi,
1965 &xmm_dst_lo, &xmm_dst_hi);
1968 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1982 *pd++ = pack_1x128_32 (
1983 pix_multiply_1x128 (
1984 unpack_32_1x128 (d),
1985 negate_1x128 (pix_multiply_1x128 (
1986 unpack_32_1x128 (m),
1987 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1992 static force_inline uint32_t
1993 core_combine_atop_ca_pixel_sse2 (uint32_t src,
1997 __m128i m = unpack_32_1x128 (mask);
1998 __m128i s = unpack_32_1x128 (src);
1999 __m128i d = unpack_32_1x128 (dst);
2000 __m128i sa = expand_alpha_1x128 (s);
2001 __m128i da = expand_alpha_1x128 (d);
2003 s = pix_multiply_1x128 (s, m);
2004 m = negate_1x128 (pix_multiply_1x128 (m, sa));
2006 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2010 sse2_combine_atop_ca (pixman_implementation_t *imp,
2013 const uint32_t * ps,
2014 const uint32_t * pm,
2019 __m128i xmm_src_lo, xmm_src_hi;
2020 __m128i xmm_dst_lo, xmm_dst_hi;
2021 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2022 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2023 __m128i xmm_mask_lo, xmm_mask_hi;
2025 while (w && (unsigned long)pd & 15)
2031 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2037 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2038 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2039 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2041 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2042 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2043 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2045 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2046 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2047 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2048 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2050 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2051 &xmm_mask_lo, &xmm_mask_hi,
2052 &xmm_src_lo, &xmm_src_hi);
2053 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2054 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2055 &xmm_mask_lo, &xmm_mask_hi);
2057 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2059 pix_add_multiply_2x128 (
2060 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2061 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2062 &xmm_dst_lo, &xmm_dst_hi);
2065 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2079 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2084 static force_inline uint32_t
2085 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2089 __m128i m = unpack_32_1x128 (mask);
2090 __m128i s = unpack_32_1x128 (src);
2091 __m128i d = unpack_32_1x128 (dst);
2093 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2094 __m128i sa = expand_alpha_1x128 (s);
2096 s = pix_multiply_1x128 (s, m);
2097 m = pix_multiply_1x128 (m, sa);
2099 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2103 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2106 const uint32_t * ps,
2107 const uint32_t * pm,
2112 __m128i xmm_src_lo, xmm_src_hi;
2113 __m128i xmm_dst_lo, xmm_dst_hi;
2114 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2115 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2116 __m128i xmm_mask_lo, xmm_mask_hi;
2118 while (w && (unsigned long)pd & 15)
2124 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2130 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2131 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2132 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2134 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2135 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2136 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2138 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2139 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2140 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2141 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2143 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2144 &xmm_mask_lo, &xmm_mask_hi,
2145 &xmm_src_lo, &xmm_src_hi);
2146 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2147 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2148 &xmm_mask_lo, &xmm_mask_hi);
2150 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2151 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2153 pix_add_multiply_2x128 (
2154 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2155 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2156 &xmm_dst_lo, &xmm_dst_hi);
2159 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2173 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2178 static force_inline uint32_t
2179 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2183 __m128i a = unpack_32_1x128 (mask);
2184 __m128i s = unpack_32_1x128 (src);
2185 __m128i d = unpack_32_1x128 (dst);
2187 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2188 a, expand_alpha_1x128 (s)));
2189 __m128i dest = pix_multiply_1x128 (s, a);
2190 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2192 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2199 sse2_combine_xor_ca (pixman_implementation_t *imp,
2202 const uint32_t * ps,
2203 const uint32_t * pm,
2208 __m128i xmm_src_lo, xmm_src_hi;
2209 __m128i xmm_dst_lo, xmm_dst_hi;
2210 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2211 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2212 __m128i xmm_mask_lo, xmm_mask_hi;
2214 while (w && (unsigned long)pd & 15)
2220 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2226 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2227 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2228 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2230 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2231 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2232 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2234 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2235 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2236 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2237 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2239 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2240 &xmm_mask_lo, &xmm_mask_hi,
2241 &xmm_src_lo, &xmm_src_hi);
2242 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2243 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2244 &xmm_mask_lo, &xmm_mask_hi);
2246 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2247 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2248 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2249 &xmm_mask_lo, &xmm_mask_hi);
2251 pix_add_multiply_2x128 (
2252 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2253 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2254 &xmm_dst_lo, &xmm_dst_hi);
2257 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2271 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2277 sse2_combine_add_ca (pixman_implementation_t *imp,
2280 const uint32_t * ps,
2281 const uint32_t * pm,
2286 __m128i xmm_src_lo, xmm_src_hi;
2287 __m128i xmm_dst_lo, xmm_dst_hi;
2288 __m128i xmm_mask_lo, xmm_mask_hi;
2290 while (w && (unsigned long)pd & 15)
2296 *pd++ = pack_1x128_32 (
2297 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2298 unpack_32_1x128 (m)),
2299 unpack_32_1x128 (d)));
2305 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2306 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2307 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2309 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2310 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2311 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2313 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2314 &xmm_mask_lo, &xmm_mask_hi,
2315 &xmm_src_lo, &xmm_src_hi);
2318 (__m128i*)pd, pack_2x128_128 (
2319 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2320 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2334 *pd++ = pack_1x128_32 (
2335 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2336 unpack_32_1x128 (m)),
2337 unpack_32_1x128 (d)));
2342 static force_inline __m128i
2343 create_mask_16_128 (uint16_t mask)
2345 return _mm_set1_epi16 (mask);
2348 /* Work around a code generation bug in Sun Studio 12. */
2349 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2350 # define create_mask_2x32_128(mask0, mask1) \
2351 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2353 static force_inline __m128i
2354 create_mask_2x32_128 (uint32_t mask0,
2357 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
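/* The composite fast paths below follow pixman's naming scheme,
 * <op>_<src>[_<mask>]_<dst>, where n is a solid color, 8888 is
 * a8r8g8b8, x888 is x8r8g8b8, 0565 is r5g6b5, 8 is a8, and a _ca
 * suffix means a component-alpha mask.
 */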
2362 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2363 pixman_composite_info_t *info)
2365 PIXMAN_COMPOSITE_ARGS (info);
2367 uint32_t *dst_line, *dst, d;
2370 __m128i xmm_src, xmm_alpha;
2371 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2373 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2378 PIXMAN_IMAGE_GET_LINE (
2379 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2381 xmm_src = expand_pixel_32_1x128 (src);
2382 xmm_alpha = expand_alpha_1x128 (xmm_src);
2388 dst_line += dst_stride;
2391 while (w && (unsigned long)dst & 15)
2394 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2396 unpack_32_1x128 (d)));
2402 xmm_dst = load_128_aligned ((__m128i*)dst);
2404 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2406 over_2x128 (&xmm_src, &xmm_src,
2407 &xmm_alpha, &xmm_alpha,
2408 &xmm_dst_lo, &xmm_dst_hi);
/* rebuild the 4 pixel data and save */
2412 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2421 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2423 unpack_32_1x128 (d)));
2431 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2432 pixman_composite_info_t *info)
2434 PIXMAN_COMPOSITE_ARGS (info);
2436 uint16_t *dst_line, *dst, d;
2439 __m128i xmm_src, xmm_alpha;
2440 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2442 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2447 PIXMAN_IMAGE_GET_LINE (
2448 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2450 xmm_src = expand_pixel_32_1x128 (src);
2451 xmm_alpha = expand_alpha_1x128 (xmm_src);
2457 dst_line += dst_stride;
2460 while (w && (unsigned long)dst & 15)
2464 *dst++ = pack_565_32_16 (
2465 pack_1x128_32 (over_1x128 (xmm_src,
2467 expand565_16_1x128 (d))));
2473 xmm_dst = load_128_aligned ((__m128i*)dst);
2475 unpack_565_128_4x128 (xmm_dst,
2476 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2478 over_2x128 (&xmm_src, &xmm_src,
2479 &xmm_alpha, &xmm_alpha,
2480 &xmm_dst0, &xmm_dst1);
2481 over_2x128 (&xmm_src, &xmm_src,
2482 &xmm_alpha, &xmm_alpha,
2483 &xmm_dst2, &xmm_dst3);
2485 xmm_dst = pack_565_4x128_128 (
2486 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2488 save_128_aligned ((__m128i*)dst, xmm_dst);
2497 *dst++ = pack_565_32_16 (
2498 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2499 expand565_16_1x128 (d))));
2506 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2507 pixman_composite_info_t *info)
2509 PIXMAN_COMPOSITE_ARGS (info);
2511 uint32_t *dst_line, d;
2512 uint32_t *mask_line, m;
2514 int dst_stride, mask_stride;
2518 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2520 __m128i mmx_src, mmx_mask, mmx_dest;
2522 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2527 PIXMAN_IMAGE_GET_LINE (
2528 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2529 PIXMAN_IMAGE_GET_LINE (
2530 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2532 xmm_src = _mm_unpacklo_epi8 (
2533 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2539 const uint32_t *pm = (uint32_t *)mask_line;
2540 uint32_t *pd = (uint32_t *)dst_line;
2542 dst_line += dst_stride;
2543 mask_line += mask_stride;
2545 while (w && (unsigned long)pd & 15)
2553 mmx_mask = unpack_32_1x128 (m);
2554 mmx_dest = unpack_32_1x128 (d);
2556 *pd = pack_1x128_32 (
2557 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2567 xmm_mask = load_128_unaligned ((__m128i*)pm);
2571 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
/* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2574 if (pack_cmp != 0xffff)
2576 xmm_dst = load_128_aligned ((__m128i*)pd);
2578 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2580 pix_multiply_2x128 (&xmm_src, &xmm_src,
2581 &xmm_mask_lo, &xmm_mask_hi,
2582 &xmm_mask_lo, &xmm_mask_hi);
2583 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2586 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2602 mmx_mask = unpack_32_1x128 (m);
2603 mmx_dest = unpack_32_1x128 (d);
2605 *pd = pack_1x128_32 (
2606 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2618 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2619 pixman_composite_info_t *info)
2621 PIXMAN_COMPOSITE_ARGS (info);
2623 uint32_t *dst_line, d;
2624 uint32_t *mask_line, m;
2626 int dst_stride, mask_stride;
2628 __m128i xmm_src, xmm_alpha;
2629 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2630 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2632 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2634 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2639 PIXMAN_IMAGE_GET_LINE (
2640 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2641 PIXMAN_IMAGE_GET_LINE (
2642 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2644 xmm_src = _mm_unpacklo_epi8 (
2645 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2646 xmm_alpha = expand_alpha_1x128 (xmm_src);
2648 mmx_alpha = xmm_alpha;
2653 const uint32_t *pm = (uint32_t *)mask_line;
2654 uint32_t *pd = (uint32_t *)dst_line;
2656 dst_line += dst_stride;
2657 mask_line += mask_stride;
2659 while (w && (unsigned long)pd & 15)
2666 mmx_mask = unpack_32_1x128 (m);
2667 mmx_dest = unpack_32_1x128 (d);
2669 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2681 xmm_mask = load_128_unaligned ((__m128i*)pm);
2685 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
/* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2688 if (pack_cmp != 0xffff)
2690 xmm_dst = load_128_aligned ((__m128i*)pd);
2692 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2693 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2695 in_over_2x128 (&xmm_src, &xmm_src,
2696 &xmm_alpha, &xmm_alpha,
2697 &xmm_mask_lo, &xmm_mask_hi,
2698 &xmm_dst_lo, &xmm_dst_hi);
2701 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2716 mmx_mask = unpack_32_1x128 (m);
2717 mmx_dest = unpack_32_1x128 (d);
2719 *pd = pack_1x128_32 (
2720 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2731 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2732 pixman_composite_info_t *info)
2734 PIXMAN_COMPOSITE_ARGS (info);
2735 uint32_t *dst_line, *dst;
2736 uint32_t *src_line, *src;
2739 int dst_stride, src_stride;
2742 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2743 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2744 __m128i xmm_alpha_lo, xmm_alpha_hi;
2746 PIXMAN_IMAGE_GET_LINE (
2747 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2748 PIXMAN_IMAGE_GET_LINE (
2749 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2751 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2753 xmm_mask = create_mask_16_128 (mask >> 24);
2758 dst_line += dst_stride;
2760 src_line += src_stride;
2763 while (w && (unsigned long)dst & 15)
2765 uint32_t s = *src++;
2771 __m128i ms = unpack_32_1x128 (s);
2772 __m128i alpha = expand_alpha_1x128 (ms);
2773 __m128i dest = xmm_mask;
2774 __m128i alpha_dst = unpack_32_1x128 (d);
2776 *dst = pack_1x128_32 (
2777 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2785 xmm_src = load_128_unaligned ((__m128i*)src);
2787 if (!is_zero (xmm_src))
2789 xmm_dst = load_128_aligned ((__m128i*)dst);
2791 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2792 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2793 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2794 &xmm_alpha_lo, &xmm_alpha_hi);
2796 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2797 &xmm_alpha_lo, &xmm_alpha_hi,
2798 &xmm_mask, &xmm_mask,
2799 &xmm_dst_lo, &xmm_dst_hi);
2802 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2812 uint32_t s = *src++;
2818 __m128i ms = unpack_32_1x128 (s);
2819 __m128i alpha = expand_alpha_1x128 (ms);
2820 __m128i mask = xmm_mask;
2821 __m128i dest = unpack_32_1x128 (d);
2823 *dst = pack_1x128_32 (
2824 in_over_1x128 (&ms, &alpha, &mask, &dest));
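/* SRC x888 -> 8888 is a straight copy that forces the alpha byte to
 * 0xff; the inner loop below handles 16 pixels per iteration.
 */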
2835 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2836 pixman_composite_info_t *info)
2838 PIXMAN_COMPOSITE_ARGS (info);
2839 uint32_t *dst_line, *dst;
2840 uint32_t *src_line, *src;
2842 int dst_stride, src_stride;
2845 PIXMAN_IMAGE_GET_LINE (
2846 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2847 PIXMAN_IMAGE_GET_LINE (
2848 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2853 dst_line += dst_stride;
2855 src_line += src_stride;
2858 while (w && (unsigned long)dst & 15)
2860 *dst++ = *src++ | 0xff000000;
2866 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2868 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2869 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2870 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2871 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2873 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2874 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2875 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2876 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2885 *dst++ = *src++ | 0xff000000;
2893 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2894 pixman_composite_info_t *info)
2896 PIXMAN_COMPOSITE_ARGS (info);
2897 uint32_t *dst_line, *dst;
2898 uint32_t *src_line, *src;
2900 int dst_stride, src_stride;
2903 __m128i xmm_mask, xmm_alpha;
2904 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2905 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2907 PIXMAN_IMAGE_GET_LINE (
2908 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2909 PIXMAN_IMAGE_GET_LINE (
2910 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2912 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2914 xmm_mask = create_mask_16_128 (mask >> 24);
2915 xmm_alpha = mask_00ff;
2920 dst_line += dst_stride;
2922 src_line += src_stride;
2925 while (w && (unsigned long)dst & 15)
2927 uint32_t s = (*src++) | 0xff000000;
2930 __m128i src = unpack_32_1x128 (s);
2931 __m128i alpha = xmm_alpha;
2932 __m128i mask = xmm_mask;
2933 __m128i dest = unpack_32_1x128 (d);
2935 *dst++ = pack_1x128_32 (
2936 in_over_1x128 (&src, &alpha, &mask, &dest));
2943 xmm_src = _mm_or_si128 (
2944 load_128_unaligned ((__m128i*)src), mask_ff000000);
2945 xmm_dst = load_128_aligned ((__m128i*)dst);
2947 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2948 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2950 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2951 &xmm_alpha, &xmm_alpha,
2952 &xmm_mask, &xmm_mask,
2953 &xmm_dst_lo, &xmm_dst_hi);
2956 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2966 uint32_t s = (*src++) | 0xff000000;
2969 __m128i src = unpack_32_1x128 (s);
2970 __m128i alpha = xmm_alpha;
2971 __m128i mask = xmm_mask;
2972 __m128i dest = unpack_32_1x128 (d);
2974 *dst++ = pack_1x128_32 (
2975 in_over_1x128 (&src, &alpha, &mask, &dest));
2984 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
2985 pixman_composite_info_t *info)
2987 PIXMAN_COMPOSITE_ARGS (info);
2988 int dst_stride, src_stride;
2989 uint32_t *dst_line, *dst;
2990 uint32_t *src_line, *src;
2992 PIXMAN_IMAGE_GET_LINE (
2993 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2994 PIXMAN_IMAGE_GET_LINE (
2995 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3002 sse2_combine_over_u (imp, op, dst, src, NULL, width);
3009 static force_inline uint16_t
3010 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3014 ms = unpack_32_1x128 (src);
3015 return pack_565_32_16 (
3018 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3022 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3023 pixman_composite_info_t *info)
3025 PIXMAN_COMPOSITE_ARGS (info);
3026 uint16_t *dst_line, *dst, d;
3027 uint32_t *src_line, *src, s;
3028 int dst_stride, src_stride;
3031 __m128i xmm_alpha_lo, xmm_alpha_hi;
3032 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3033 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3035 PIXMAN_IMAGE_GET_LINE (
3036 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3037 PIXMAN_IMAGE_GET_LINE (
3038 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3045 dst_line += dst_stride;
3046 src_line += src_stride;
3049 /* Align dst on a 16-byte boundary */
3051 ((unsigned long)dst & 15))
3056 *dst++ = composite_over_8888_0565pixel (s, d);
/* It's an 8-pixel loop */
/* I'm loading unaligned because I'm not sure
 * about the address alignment.
 */
3066 xmm_src = load_128_unaligned ((__m128i*) src);
3067 xmm_dst = load_128_aligned ((__m128i*) dst);
3070 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3071 unpack_565_128_4x128 (xmm_dst,
3072 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3073 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3074 &xmm_alpha_lo, &xmm_alpha_hi);
/* I'm loading the next 4 pixels from memory
 * ahead of time, to optimize the memory read.
 */
3079 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3081 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3082 &xmm_alpha_lo, &xmm_alpha_hi,
3083 &xmm_dst0, &xmm_dst1);
3086 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3087 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3088 &xmm_alpha_lo, &xmm_alpha_hi);
3090 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3091 &xmm_alpha_lo, &xmm_alpha_hi,
3092 &xmm_dst2, &xmm_dst3);
3095 (__m128i*)dst, pack_565_4x128_128 (
3096 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3108 *dst++ = composite_over_8888_0565pixel (s, d);
3115 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3116 pixman_composite_info_t *info)
3118 PIXMAN_COMPOSITE_ARGS (info);
3120 uint32_t *dst_line, *dst;
3121 uint8_t *mask_line, *mask;
3122 int dst_stride, mask_stride;
3126 __m128i xmm_src, xmm_alpha, xmm_def;
3127 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3128 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3130 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3132 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3138 PIXMAN_IMAGE_GET_LINE (
3139 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3140 PIXMAN_IMAGE_GET_LINE (
3141 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3143 xmm_def = create_mask_2x32_128 (src, src);
3144 xmm_src = expand_pixel_32_1x128 (src);
3145 xmm_alpha = expand_alpha_1x128 (xmm_src);
3147 mmx_alpha = xmm_alpha;
3152 dst_line += dst_stride;
3154 mask_line += mask_stride;
3157 while (w && (unsigned long)dst & 15)
3159 uint8_t m = *mask++;
3164 mmx_mask = expand_pixel_8_1x128 (m);
3165 mmx_dest = unpack_32_1x128 (d);
3167 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3168 &mmx_alpha,
3169 &mmx_mask,
3170 &mmx_dest));
3179 m = *((uint32_t*)mask);
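3180 /* Opaque source and fully set mask: the result is the solid color itself */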
3181 if (srca == 0xff && m == 0xffffffff)
3183 save_128_aligned ((__m128i*)dst, xmm_def);
3187 xmm_dst = load_128_aligned ((__m128i*) dst);
3188 xmm_mask = unpack_32_1x128 (m);
3189 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3192 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3193 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3195 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3196 &xmm_mask_lo, &xmm_mask_hi);
3198 in_over_2x128 (&xmm_src, &xmm_src,
3199 &xmm_alpha, &xmm_alpha,
3200 &xmm_mask_lo, &xmm_mask_hi,
3201 &xmm_dst_lo, &xmm_dst_hi);
3203 save_128_aligned (
3204 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3214 uint8_t m = *mask++;
3219 mmx_mask = expand_pixel_8_1x128 (m);
3220 mmx_dest = unpack_32_1x128 (d);
3222 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3223 &mmx_alpha,
3224 &mmx_mask,
3225 &mmx_dest));
3235 static pixman_bool_t
3236 pixman_fill_sse2 (uint32_t *bits,
3237 int stride,
3238 int bpp,
3239 int x,
3240 int y,
3241 int width,
3242 int height,
3243 uint32_t data)
3245 uint32_t byte_width;
3255 stride = stride * (int) sizeof (uint32_t) / 1;
3256 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3262 data = (w << 16) | w;
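3263 /* 'data' now holds the 8-bit fill value replicated into all four bytes */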
3266 stride = stride * (int) sizeof (uint32_t) / 2;
3267 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3268 byte_width = 2 * width;
3271 data = (data & 0xffff) * 0x00010001;
3275 stride = stride * (int) sizeof (uint32_t) / 4;
3276 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3277 byte_width = 4 * width;
3285 xmm_def = create_mask_2x32_128 (data, data);
3290 uint8_t *d = byte_line;
3291 byte_line += stride;
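3292 /* Store 1-, 2- and 4-byte chunks until d is 16-byte aligned, then switch to aligned 128-bit stores */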
3294 while (w >= 1 && ((unsigned long)d & 1))
3296 *(uint8_t *)d = data;
3301 while (w >= 2 && ((unsigned long)d & 3))
3303 *(uint16_t *)d = data;
3308 while (w >= 4 && ((unsigned long)d & 15))
3310 *(uint32_t *)d = data;
3318 save_128_aligned ((__m128i*)(d), xmm_def);
3319 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3320 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3321 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3322 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3323 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3324 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3325 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3333 save_128_aligned ((__m128i*)(d), xmm_def);
3334 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3335 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3336 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3344 save_128_aligned ((__m128i*)(d), xmm_def);
3345 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3353 save_128_aligned ((__m128i*)(d), xmm_def);
3361 *(uint32_t *)d = data;
3369 *(uint16_t *)d = data;
3376 *(uint8_t *)d = data;
3386 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3387 pixman_composite_info_t *info)
3389 PIXMAN_COMPOSITE_ARGS (info);
3391 uint32_t *dst_line, *dst;
3392 uint8_t *mask_line, *mask;
3393 int dst_stride, mask_stride;
3397 __m128i xmm_src, xmm_def;
3398 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3400 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3405 pixman_fill_sse2 (dest_image->bits.bits, dest_image->bits.rowstride,
3406 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3407 dest_x, dest_y, width, height, 0);
3411 PIXMAN_IMAGE_GET_LINE (
3412 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3413 PIXMAN_IMAGE_GET_LINE (
3414 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3416 xmm_def = create_mask_2x32_128 (src, src);
3417 xmm_src = expand_pixel_32_1x128 (src);
3422 dst_line += dst_stride;
3424 mask_line += mask_stride;
3427 while (w && (unsigned long)dst & 15)
3429 uint8_t m = *mask++;
3433 *dst = pack_1x128_32 (
3434 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3447 m = *((uint32_t*)mask);
3449 if (srca == 0xff && m == 0xffffffff)
3451 save_128_aligned ((__m128i*)dst, xmm_def);
3455 xmm_mask = unpack_32_1x128 (m);
3456 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3459 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3461 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3462 &xmm_mask_lo, &xmm_mask_hi);
3464 pix_multiply_2x128 (&xmm_src, &xmm_src,
3465 &xmm_mask_lo, &xmm_mask_hi,
3466 &xmm_mask_lo, &xmm_mask_hi);
3468 save_128_aligned (
3469 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3473 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3483 uint8_t m = *mask++;
3487 *dst = pack_1x128_32 (
3488 pix_multiply_1x128 (
3489 xmm_src, expand_pixel_8_1x128 (m)));
3504 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3505 pixman_composite_info_t *info)
3507 PIXMAN_COMPOSITE_ARGS (info);
3509 uint16_t *dst_line, *dst, d;
3510 uint8_t *mask_line, *mask;
3511 int dst_stride, mask_stride;
3514 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3516 __m128i xmm_src, xmm_alpha;
3517 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3518 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3520 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3525 PIXMAN_IMAGE_GET_LINE (
3526 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3527 PIXMAN_IMAGE_GET_LINE (
3528 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3530 xmm_src = expand_pixel_32_1x128 (src);
3531 xmm_alpha = expand_alpha_1x128 (xmm_src);
3533 mmx_alpha = xmm_alpha;
3538 dst_line += dst_stride;
3540 mask_line += mask_stride;
3543 while (w && (unsigned long)dst & 15)
3550 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3551 mmx_dest = expand565_16_1x128 (d);
3553 *dst = pack_565_32_16 (
3554 pack_1x128_32 (
3555 in_over_1x128 (
3556 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3565 xmm_dst = load_128_aligned ((__m128i*) dst);
3566 unpack_565_128_4x128 (xmm_dst,
3567 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3569 m = *((uint32_t*)mask);
3574 xmm_mask = unpack_32_1x128 (m);
3575 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3578 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3580 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3581 &xmm_mask_lo, &xmm_mask_hi);
3583 in_over_2x128 (&xmm_src, &xmm_src,
3584 &xmm_alpha, &xmm_alpha,
3585 &xmm_mask_lo, &xmm_mask_hi,
3586 &xmm_dst0, &xmm_dst1);
3589 m = *((uint32_t*)mask);
3594 xmm_mask = unpack_32_1x128 (m);
3595 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3598 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3600 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3601 &xmm_mask_lo, &xmm_mask_hi);
3602 in_over_2x128 (&xmm_src, &xmm_src,
3603 &xmm_alpha, &xmm_alpha,
3604 &xmm_mask_lo, &xmm_mask_hi,
3605 &xmm_dst2, &xmm_dst3);
3608 save_128_aligned (
3609 (__m128i*)dst, pack_565_4x128_128 (
3610 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3623 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3624 mmx_dest = expand565_16_1x128 (d);
3626 *dst = pack_565_32_16 (
3627 pack_1x128_32 (
3628 in_over_1x128 (
3629 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3640 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3641 pixman_composite_info_t *info)
3643 PIXMAN_COMPOSITE_ARGS (info);
3644 uint16_t *dst_line, *dst, d;
3645 uint32_t *src_line, *src, s;
3646 int dst_stride, src_stride;
3648 uint32_t opaque, zero;
3651 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3652 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3654 PIXMAN_IMAGE_GET_LINE (
3655 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3656 PIXMAN_IMAGE_GET_LINE (
3657 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3662 dst_line += dst_stride;
3664 src_line += src_stride;
3667 while (w && (unsigned long)dst & 15)
3672 ms = unpack_32_1x128 (s);
3674 *dst++ = pack_565_32_16 (
3675 pack_1x128_32 (
3676 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3683 xmm_src = load_128_unaligned ((__m128i*)src);
3684 xmm_dst = load_128_aligned ((__m128i*)dst);
3686 opaque = is_opaque (xmm_src);
3687 zero = is_zero (xmm_src);
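3688 /* classify the block: a fully opaque source only needs its channels reordered, a fully zero source leaves dst unchanged */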
3689 unpack_565_128_4x128 (xmm_dst,
3690 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3691 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3693 /* preload next round */
3694 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3698 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3699 &xmm_dst0, &xmm_dst1);
3703 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3704 &xmm_dst0, &xmm_dst1);
3708 opaque = is_opaque (xmm_src);
3709 zero = is_zero (xmm_src);
3711 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3715 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3716 &xmm_dst2, &xmm_dst3);
3720 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3721 &xmm_dst2, &xmm_dst3);
3724 save_128_aligned (
3725 (__m128i*)dst, pack_565_4x128_128 (
3726 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3738 ms = unpack_32_1x128 (s);
3740 *dst++ = pack_565_32_16 (
3741 pack_1x128_32 (
3742 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3750 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3751 pixman_composite_info_t *info)
3753 PIXMAN_COMPOSITE_ARGS (info);
3754 uint32_t *dst_line, *dst, d;
3755 uint32_t *src_line, *src, s;
3756 int dst_stride, src_stride;
3758 uint32_t opaque, zero;
3760 __m128i xmm_src_lo, xmm_src_hi;
3761 __m128i xmm_dst_lo, xmm_dst_hi;
3763 PIXMAN_IMAGE_GET_LINE (
3764 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3765 PIXMAN_IMAGE_GET_LINE (
3766 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3771 dst_line += dst_stride;
3773 src_line += src_stride;
3776 while (w && (unsigned long)dst & 15)
3781 *dst++ = pack_1x128_32 (
3782 over_rev_non_pre_1x128 (
3783 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3790 xmm_src_hi = load_128_unaligned ((__m128i*)src);
3792 opaque = is_opaque (xmm_src_hi);
3793 zero = is_zero (xmm_src_hi);
3795 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3799 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3800 &xmm_dst_lo, &xmm_dst_hi);
3802 save_128_aligned (
3803 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3807 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3809 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3811 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3812 &xmm_dst_lo, &xmm_dst_hi);
3814 save_128_aligned (
3815 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3828 *dst++ = pack_1x128_32 (
3829 over_rev_non_pre_1x128 (
3830 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3839 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3840 pixman_composite_info_t *info)
3842 PIXMAN_COMPOSITE_ARGS (info);
3844 uint16_t *dst_line, *dst, d;
3845 uint32_t *mask_line, *mask, m;
3846 int dst_stride, mask_stride;
3850 __m128i xmm_src, xmm_alpha;
3851 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3852 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3854 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3856 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3861 PIXMAN_IMAGE_GET_LINE (
3862 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3863 PIXMAN_IMAGE_GET_LINE (
3864 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3866 xmm_src = expand_pixel_32_1x128 (src);
3867 xmm_alpha = expand_alpha_1x128 (xmm_src);
3869 mmx_alpha = xmm_alpha;
3876 mask_line += mask_stride;
3877 dst_line += dst_stride;
3879 while (w && ((unsigned long)dst & 15))
3881 m = *(uint32_t *) mask;
3886 mmx_mask = unpack_32_1x128 (m);
3887 mmx_dest = expand565_16_1x128 (d);
3889 *dst = pack_565_32_16 (
3890 pack_1x128_32 (
3891 in_over_1x128 (
3892 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3903 xmm_mask = load_128_unaligned ((__m128i*)mask);
3904 xmm_dst = load_128_aligned ((__m128i*)dst);
3906 pack_cmp = _mm_movemask_epi8 (
3907 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
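3908 /* pack_cmp is 0xffff iff all four mask values are zero */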
3909 unpack_565_128_4x128 (xmm_dst,
3910 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3911 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3913 /* preload next round */
3914 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
3916 /* skip the blend when all four mask values are zero */
3917 if (pack_cmp != 0xffff)
3919 in_over_2x128 (&xmm_src, &xmm_src,
3920 &xmm_alpha, &xmm_alpha,
3921 &xmm_mask_lo, &xmm_mask_hi,
3922 &xmm_dst0, &xmm_dst1);
3926 pack_cmp = _mm_movemask_epi8 (
3927 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3929 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3931 if (pack_cmp != 0xffff)
3933 in_over_2x128 (&xmm_src, &xmm_src,
3934 &xmm_alpha, &xmm_alpha,
3935 &xmm_mask_lo, &xmm_mask_hi,
3936 &xmm_dst2, &xmm_dst3);
3939 save_128_aligned (
3940 (__m128i*)dst, pack_565_4x128_128 (
3941 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3950 m = *(uint32_t *) mask;
3955 mmx_mask = unpack_32_1x128 (m);
3956 mmx_dest = expand565_16_1x128 (d);
3958 *dst = pack_565_32_16 (
3959 pack_1x128_32 (
3960 in_over_1x128 (
3961 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3973 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
3974 pixman_composite_info_t *info)
3976 PIXMAN_COMPOSITE_ARGS (info);
3977 uint8_t *dst_line, *dst;
3978 uint8_t *mask_line, *mask;
3979 int dst_stride, mask_stride;
3985 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3986 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3988 PIXMAN_IMAGE_GET_LINE (
3989 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3990 PIXMAN_IMAGE_GET_LINE (
3991 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3993 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3995 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4000 dst_line += dst_stride;
4002 mask_line += mask_stride;
4005 while (w && ((unsigned long)dst & 15))
4007 m = (uint32_t) *mask++;
4008 d = (uint32_t) *dst;
4010 *dst++ = (uint8_t) pack_1x128_32 (
4011 pix_multiply_1x128 (
4012 pix_multiply_1x128 (xmm_alpha,
4013 unpack_32_1x128 (m)),
4014 unpack_32_1x128 (d)));
4020 xmm_mask = load_128_unaligned ((__m128i*)mask);
4021 xmm_dst = load_128_aligned ((__m128i*)dst);
4023 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4024 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4026 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4027 &xmm_mask_lo, &xmm_mask_hi,
4028 &xmm_mask_lo, &xmm_mask_hi);
4030 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4031 &xmm_dst_lo, &xmm_dst_hi,
4032 &xmm_dst_lo, &xmm_dst_hi);
4034 save_128_aligned (
4035 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4044 m = (uint32_t) *mask++;
4045 d = (uint32_t) *dst;
4047 *dst++ = (uint8_t) pack_1x128_32 (
4048 pix_multiply_1x128 (
4049 pix_multiply_1x128 (
4050 xmm_alpha, unpack_32_1x128 (m)),
4051 unpack_32_1x128 (d)));
4059 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4060 pixman_composite_info_t *info)
4062 PIXMAN_COMPOSITE_ARGS (info);
4063 uint8_t *dst_line, *dst;
4070 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4072 PIXMAN_IMAGE_GET_LINE (
4073 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4075 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4077 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4086 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4087 8, dest_x, dest_y, width, height, src);
4095 dst_line += dst_stride;
4098 while (w && ((unsigned long)dst & 15))
4100 d = (uint32_t) *dst;
4102 *dst++ = (uint8_t) pack_1x128_32 (
4103 pix_multiply_1x128 (
4104 xmm_alpha,
4105 unpack_32_1x128 (d)));
4111 xmm_dst = load_128_aligned ((__m128i*)dst);
4113 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4115 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4116 &xmm_dst_lo, &xmm_dst_hi,
4117 &xmm_dst_lo, &xmm_dst_hi);
4119 save_128_aligned (
4120 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4128 d = (uint32_t) *dst;
4130 *dst++ = (uint8_t) pack_1x128_32 (
4131 pix_multiply_1x128 (
4132 xmm_alpha,
4133 unpack_32_1x128 (d)));
4141 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4142 pixman_composite_info_t *info)
4144 PIXMAN_COMPOSITE_ARGS (info);
4145 uint8_t *dst_line, *dst;
4146 uint8_t *src_line, *src;
4147 int src_stride, dst_stride;
4151 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4152 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4154 PIXMAN_IMAGE_GET_LINE (
4155 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4156 PIXMAN_IMAGE_GET_LINE (
4157 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4162 dst_line += dst_stride;
4164 src_line += src_stride;
4167 while (w && ((unsigned long)dst & 15))
4169 s = (uint32_t) *src++;
4170 d = (uint32_t) *dst;
4172 *dst++ = (uint8_t) pack_1x128_32 (
4173 pix_multiply_1x128 (
4174 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4180 xmm_src = load_128_unaligned ((__m128i*)src);
4181 xmm_dst = load_128_aligned ((__m128i*)dst);
4183 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4184 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4186 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4187 &xmm_dst_lo, &xmm_dst_hi,
4188 &xmm_dst_lo, &xmm_dst_hi);
4190 save_128_aligned (
4191 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4200 s = (uint32_t) *src++;
4201 d = (uint32_t) *dst;
4203 *dst++ = (uint8_t) pack_1x128_32 (
4204 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4212 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4213 pixman_composite_info_t *info)
4215 PIXMAN_COMPOSITE_ARGS (info);
4216 uint8_t *dst_line, *dst;
4217 uint8_t *mask_line, *mask;
4218 int dst_stride, mask_stride;
4224 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4225 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4227 PIXMAN_IMAGE_GET_LINE (
4228 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4229 PIXMAN_IMAGE_GET_LINE (
4230 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4232 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4234 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4239 dst_line += dst_stride;
4241 mask_line += mask_stride;
4244 while (w && ((unsigned long)dst & 15))
4246 m = (uint32_t) *mask++;
4247 d = (uint32_t) *dst;
4249 *dst++ = (uint8_t) pack_1x128_32 (
4250 _mm_adds_epu16 (
4251 pix_multiply_1x128 (
4252 xmm_alpha, unpack_32_1x128 (m)),
4253 unpack_32_1x128 (d)));
4259 xmm_mask = load_128_unaligned ((__m128i*)mask);
4260 xmm_dst = load_128_aligned ((__m128i*)dst);
4262 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4263 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4265 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4266 &xmm_mask_lo, &xmm_mask_hi,
4267 &xmm_mask_lo, &xmm_mask_hi);
4269 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4270 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4272 save_128_aligned (
4273 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4282 m = (uint32_t) *mask++;
4283 d = (uint32_t) *dst;
4285 *dst++ = (uint8_t) pack_1x128_32 (
4286 _mm_adds_epu16 (
4287 pix_multiply_1x128 (
4288 xmm_alpha, unpack_32_1x128 (m)),
4289 unpack_32_1x128 (d)));
4298 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4299 pixman_composite_info_t *info)
4301 PIXMAN_COMPOSITE_ARGS (info);
4302 uint8_t *dst_line, *dst;
4309 PIXMAN_IMAGE_GET_LINE (
4310 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4312 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4321 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4322 8, dest_x, dest_y, width, height, 0xff);
4327 src = (src << 24) | (src << 16) | (src << 8) | src;
4328 xmm_src = _mm_set_epi32 (src, src, src, src);
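4329 /* the 8-bit source is now replicated across all 16 bytes of xmm_src */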
4333 dst_line += dst_stride;
4336 while (w && ((unsigned long)dst & 15))
4338 *dst = (uint8_t)_mm_cvtsi128_si32 (
4339 _mm_adds_epu8 (
4340 xmm_src,
4341 _mm_cvtsi32_si128 (*dst)));
4349 save_128_aligned (
4350 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4358 *dst = (uint8_t)_mm_cvtsi128_si32 (
4359 _mm_adds_epu8 (
4360 xmm_src,
4361 _mm_cvtsi32_si128 (*dst)));
4371 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4372 pixman_composite_info_t *info)
4374 PIXMAN_COMPOSITE_ARGS (info);
4375 uint8_t *dst_line, *dst;
4376 uint8_t *src_line, *src;
4377 int dst_stride, src_stride;
4381 PIXMAN_IMAGE_GET_LINE (
4382 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4383 PIXMAN_IMAGE_GET_LINE (
4384 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4391 dst_line += dst_stride;
4392 src_line += src_stride;
4396 while (w && (unsigned long)dst & 3)
4398 t = (*dst) + (*src++);
4399 *dst++ = t | (0 - (t >> 8));
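4400 /* branchless saturation: if t overflowed 0xff, (t >> 8) is 1, so the OR forces the byte to 0xff */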
4403 sse2_combine_add_u (imp, op,
4404 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4414 t = (*dst) + (*src++);
4415 *dst++ = t | (0 - (t >> 8));
4423 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4424 pixman_composite_info_t *info)
4426 PIXMAN_COMPOSITE_ARGS (info);
4427 uint32_t *dst_line, *dst;
4428 uint32_t *src_line, *src;
4429 int dst_stride, src_stride;
4431 PIXMAN_IMAGE_GET_LINE (
4432 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4433 PIXMAN_IMAGE_GET_LINE (
4434 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4439 dst_line += dst_stride;
4441 src_line += src_stride;
4443 sse2_combine_add_u (imp, op, dst, src, NULL, width);
4448 static pixman_bool_t
4449 pixman_blt_sse2 (uint32_t *src_bits,
4450 uint32_t *dst_bits,
4451 int src_stride,
4452 int dst_stride,
4453 int src_bpp,
4454 int dst_bpp,
4455 int src_x,
4456 int src_y,
4457 int dest_x,
4458 int dest_y,
4459 int width,
4460 int height)
4462 uint8_t * src_bytes;
4463 uint8_t * dst_bytes;
4466 if (src_bpp != dst_bpp)
4471 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4472 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4473 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4474 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4475 byte_width = 2 * width;
4479 else if (src_bpp == 32)
4481 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4482 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4483 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4484 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4485 byte_width = 4 * width;
4497 uint8_t *s = src_bytes;
4498 uint8_t *d = dst_bytes;
4499 src_bytes += src_stride;
4500 dst_bytes += dst_stride;
4503 while (w >= 2 && ((unsigned long)d & 3))
4505 *(uint16_t *)d = *(uint16_t *)s;
4511 while (w >= 4 && ((unsigned long)d & 15))
4513 *(uint32_t *)d = *(uint32_t *)s;
4522 __m128i xmm0, xmm1, xmm2, xmm3;
4524 xmm0 = load_128_unaligned ((__m128i*)(s));
4525 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4526 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4527 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4529 save_128_aligned ((__m128i*)(d), xmm0);
4530 save_128_aligned ((__m128i*)(d + 16), xmm1);
4531 save_128_aligned ((__m128i*)(d + 32), xmm2);
4532 save_128_aligned ((__m128i*)(d + 48), xmm3);
4541 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4550 *(uint32_t *)d = *(uint32_t *)s;
4559 *(uint16_t *)d = *(uint16_t *)s;
4571 sse2_composite_copy_area (pixman_implementation_t *imp,
4572 pixman_composite_info_t *info)
4574 PIXMAN_COMPOSITE_ARGS (info);
4575 pixman_blt_sse2 (src_image->bits.bits,
4576 dest_image->bits.bits,
4577 src_image->bits.rowstride,
4578 dest_image->bits.rowstride,
4579 PIXMAN_FORMAT_BPP (src_image->bits.format),
4580 PIXMAN_FORMAT_BPP (dest_image->bits.format),
4581 src_x, src_y, dest_x, dest_y, width, height);
4585 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4586 pixman_composite_info_t *info)
4588 PIXMAN_COMPOSITE_ARGS (info);
4589 uint32_t *src, *src_line, s;
4590 uint32_t *dst, *dst_line, d;
4591 uint8_t *mask, *mask_line;
4593 int src_stride, mask_stride, dst_stride;
4597 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4598 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4599 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4601 PIXMAN_IMAGE_GET_LINE (
4602 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4603 PIXMAN_IMAGE_GET_LINE (
4604 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4605 PIXMAN_IMAGE_GET_LINE (
4606 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4611 src_line += src_stride;
4613 dst_line += dst_stride;
4615 mask_line += mask_stride;
4619 while (w && (unsigned long)dst & 15)
4621 s = 0xff000000 | *src++;
4622 m = (uint32_t) *mask++;
4624 ms = unpack_32_1x128 (s);
4628 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4629 __m128i md = unpack_32_1x128 (d);
4631 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4634 *dst++ = pack_1x128_32 (ms);
4640 m = *(uint32_t*) mask;
4641 xmm_src = _mm_or_si128 (
4642 load_128_unaligned ((__m128i*)src), mask_ff000000);
4644 if (m == 0xffffffff)
4646 save_128_aligned ((__m128i*)dst, xmm_src);
4650 xmm_dst = load_128_aligned ((__m128i*)dst);
4652 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4654 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4655 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4656 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4658 expand_alpha_rev_2x128 (
4659 xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4661 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4662 &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4663 &xmm_dst_lo, &xmm_dst_hi);
4665 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4676 m = (uint32_t) *mask++;
4680 s = 0xff000000 | *src;
4692 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4693 md = unpack_32_1x128 (d);
4694 ms = unpack_32_1x128 (s);
4696 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4710 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4711 pixman_composite_info_t *info)
4713 PIXMAN_COMPOSITE_ARGS (info);
4714 uint32_t *src, *src_line, s;
4715 uint32_t *dst, *dst_line, d;
4716 uint8_t *mask, *mask_line;
4718 int src_stride, mask_stride, dst_stride;
4721 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4722 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4723 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4725 PIXMAN_IMAGE_GET_LINE (
4726 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4727 PIXMAN_IMAGE_GET_LINE (
4728 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4729 PIXMAN_IMAGE_GET_LINE (
4730 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4735 src_line += src_stride;
4737 dst_line += dst_stride;
4739 mask_line += mask_stride;
4743 while (w && (unsigned long)dst & 15)
4748 m = (uint32_t) *mask++;
4755 if (sa == 0xff && m == 0xff)
4761 __m128i ms, md, ma, msa;
4763 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
4764 ms = unpack_32_1x128 (s);
4765 md = unpack_32_1x128 (d);
4767 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
4769 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
4779 m = *(uint32_t *) mask;
4783 xmm_src = load_128_unaligned ((__m128i*)src);
4785 if (m == 0xffffffff && is_opaque (xmm_src))
4787 save_128_aligned ((__m128i *)dst, xmm_src);
4791 xmm_dst = load_128_aligned ((__m128i *)dst);
4793 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4795 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4796 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4797 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4799 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
4800 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4802 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
4803 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
4805 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4820 m = (uint32_t) *mask++;
4827 if (sa == 0xff && m == 0xff)
4833 __m128i ms, md, ma, msa;
4835 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
4836 ms = unpack_32_1x128 (s);
4837 md = unpack_32_1x128 (d);
4839 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
4841 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
4853 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
4854 pixman_composite_info_t *info)
4856 PIXMAN_COMPOSITE_ARGS (info);
4858 uint32_t *dst_line, *dst;
4860 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4861 __m128i xmm_dsta_hi, xmm_dsta_lo;
4865 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4870 PIXMAN_IMAGE_GET_LINE (
4871 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4873 xmm_src = expand_pixel_32_1x128 (src);
4879 dst_line += dst_stride;
4882 while (w && (unsigned long)dst & 15)
4886 vd = unpack_32_1x128 (*dst);
4888 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
4889 xmm_src));
4896 __m128i tmp_lo, tmp_hi;
4898 xmm_dst = load_128_aligned ((__m128i*)dst);
4900 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4901 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
4903 tmp_lo = xmm_src;
4904 tmp_hi = xmm_src;
4906 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
4907 &xmm_dsta_lo, &xmm_dsta_hi,
4908 &tmp_lo, &tmp_hi);
4910 save_128_aligned (
4911 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
4921 vd = unpack_32_1x128 (*dst);
4923 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
4924 xmm_src));
4934 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
4935 pixman_composite_info_t *info)
4937 PIXMAN_COMPOSITE_ARGS (info);
4938 uint32_t *src, *src_line, s;
4939 uint32_t *dst, *dst_line, d;
4940 uint32_t *mask, *mask_line;
4942 int src_stride, mask_stride, dst_stride;
4945 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4946 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4947 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4949 PIXMAN_IMAGE_GET_LINE (
4950 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4951 PIXMAN_IMAGE_GET_LINE (
4952 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4953 PIXMAN_IMAGE_GET_LINE (
4954 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4959 src_line += src_stride;
4961 dst_line += dst_stride;
4963 mask_line += mask_stride;
4967 while (w && (unsigned long)dst & 15)
4972 m = (*mask++) >> 24;
4979 if (sa == 0xff && m == 0xff)
4985 __m128i ms, md, ma, msa;
4987 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
4988 ms = unpack_32_1x128 (s);
4989 md = unpack_32_1x128 (d);
4991 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
4993 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5003 xmm_mask = load_128_unaligned ((__m128i*)mask);
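5004 /* if every mask alpha in the block is zero, the destination is left untouched */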
5005 if (!is_transparent (xmm_mask))
5007 xmm_src = load_128_unaligned ((__m128i*)src);
5009 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5011 save_128_aligned ((__m128i *)dst, xmm_src);
5015 xmm_dst = load_128_aligned ((__m128i *)dst);
5017 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5018 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5019 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5021 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5022 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5024 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5025 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5027 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5042 m = (*mask++) >> 24;
5049 if (sa == 0xff && m == 0xff)
5055 __m128i ms, md, ma, msa;
5057 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5058 ms = unpack_32_1x128 (s);
5059 md = unpack_32_1x128 (d);
5061 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5063 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5074 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5075 static force_inline void
5076 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5077 const uint32_t* ps,
5078 int32_t w,
5079 pixman_fixed_t vx,
5080 pixman_fixed_t unit_x,
5081 pixman_fixed_t max_vx,
5082 pixman_bool_t fully_transparent_src)
5085 const uint32_t* pm = NULL;
5087 __m128i xmm_dst_lo, xmm_dst_hi;
5088 __m128i xmm_src_lo, xmm_src_hi;
5089 __m128i xmm_alpha_lo, xmm_alpha_hi;
5091 if (fully_transparent_src)
5092 return;
5094 /* Align dst on a 16-byte boundary */
5095 while (w && ((unsigned long)pd & 15))
5098 s = combine1 (ps + (vx >> 16), pm);
5101 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5110 uint32_t tmp1, tmp2, tmp3, tmp4;
5112 tmp1 = ps[vx >> 16];
5114 tmp2 = ps[vx >> 16];
5116 tmp3 = ps[vx >> 16];
5118 tmp4 = ps[vx >> 16];
5121 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
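5122 /* four nearest-neighbour source pixels, fetched at stepped vx, packed into one register */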
5123 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5125 if (is_opaque (xmm_src_hi))
5127 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5129 else if (!is_zero (xmm_src_hi))
5131 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5133 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5134 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5136 expand_alpha_2x128 (
5137 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5139 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5140 &xmm_alpha_lo, &xmm_alpha_hi,
5141 &xmm_dst_lo, &xmm_dst_hi);
5143 /* rebuild the 4 pixel data and save */
5144 save_128_aligned ((__m128i*)pd,
5145 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5157 s = combine1 (ps + (vx >> 16), pm);
5160 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5168 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5169 scaled_nearest_scanline_sse2_8888_8888_OVER,
5170 uint32_t, uint32_t, COVER)
5171 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5172 scaled_nearest_scanline_sse2_8888_8888_OVER,
5173 uint32_t, uint32_t, NONE)
5174 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5175 scaled_nearest_scanline_sse2_8888_8888_OVER,
5176 uint32_t, uint32_t, PAD)
5178 static force_inline void
5179 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5180 uint32_t * dst,
5181 const uint32_t * src,
5182 int32_t w,
5183 pixman_fixed_t vx,
5184 pixman_fixed_t unit_x,
5185 pixman_fixed_t max_vx,
5186 pixman_bool_t zero_src)
5189 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5190 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5191 __m128i xmm_alpha_lo, xmm_alpha_hi;
5193 if (zero_src || (*mask >> 24) == 0)
5194 return;
5196 xmm_mask = create_mask_16_128 (*mask >> 24);
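5197 /* the mask is solid: only its alpha matters, so replicate it once outside the loop */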
5198 while (w && (unsigned long)dst & 15)
5200 uint32_t s = src[pixman_fixed_to_int (vx)];
5207 __m128i ms = unpack_32_1x128 (s);
5208 __m128i alpha = expand_alpha_1x128 (ms);
5209 __m128i dest = xmm_mask;
5210 __m128i alpha_dst = unpack_32_1x128 (d);
5212 *dst = pack_1x128_32 (
5213 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5221 uint32_t tmp1, tmp2, tmp3, tmp4;
5223 tmp1 = src[pixman_fixed_to_int (vx)];
5225 tmp2 = src[pixman_fixed_to_int (vx)];
5227 tmp3 = src[pixman_fixed_to_int (vx)];
5229 tmp4 = src[pixman_fixed_to_int (vx)];
5232 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5234 if (!is_zero (xmm_src))
5236 xmm_dst = load_128_aligned ((__m128i*)dst);
5238 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5239 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5240 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5241 &xmm_alpha_lo, &xmm_alpha_hi);
5243 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5244 &xmm_alpha_lo, &xmm_alpha_hi,
5245 &xmm_mask, &xmm_mask,
5246 &xmm_dst_lo, &xmm_dst_hi);
5248 save_128_aligned (
5249 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5258 uint32_t s = src[pixman_fixed_to_int (vx)];
5265 __m128i ms = unpack_32_1x128 (s);
5266 __m128i alpha = expand_alpha_1x128 (ms);
5267 __m128i mask = xmm_mask;
5268 __m128i dest = unpack_32_1x128 (d);
5270 *dst = pack_1x128_32 (
5271 in_over_1x128 (&ms, &alpha, &mask, &dest));
5280 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5281 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5282 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5283 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5284 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5285 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5286 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5287 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5288 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5290 #define BILINEAR_DECLARE_VARIABLES \
5291 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5292 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5293 const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
5294 const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \
5295 const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \
5296 unit_x, unit_x, unit_x, unit_x); \
5297 const __m128i xmm_zero = _mm_setzero_si128 (); \
5298 __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
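5299 /* The vertical weights satisfy wt + wb == 256; xmm_addc + (xmm_xorc ^ (xmm_x >> 8)) later yields (256 - frac) in the low lanes and frac in the high lanes, so the final >> 16 divides out both 256-weight factors. */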
5300 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
5301 do { \
5302 __m128i xmm_wh, xmm_lo, xmm_hi, a; \
5303 /* fetch 2x2 pixel block into sse2 register */ \
5304 uint32_t tl = src_top [pixman_fixed_to_int (vx)]; \
5305 uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1]; \
5306 uint32_t bl = src_bottom [pixman_fixed_to_int (vx)]; \
5307 uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1]; \
5308 a = _mm_set_epi32 (tr, tl, br, bl); \
5309 vx += unit_x; \
5310 /* vertical interpolation */ \
5311 a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \
5312 xmm_wt), \
5313 _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \
5314 xmm_wb)); \
5315 /* calculate horizontal weights */ \
5316 xmm_wh = _mm_add_epi16 (xmm_addc, \
5317 _mm_xor_si128 (xmm_xorc, \
5318 _mm_srli_epi16 (xmm_x, 8))); \
5319 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
5320 /* horizontal interpolation */ \
5321 xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
5322 xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
5323 a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
5324 _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
5325 /* shift and pack the result */ \
5326 a = _mm_srli_epi32 (a, 16); \
5327 a = _mm_packs_epi32 (a, a); \
5328 a = _mm_packus_epi16 (a, a); \
5329 pix = _mm_cvtsi128_si32 (a); \
5330 } while (0)
5332 #define BILINEAR_SKIP_ONE_PIXEL() \
5333 do { \
5334 vx += unit_x; \
5335 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
5336 } while (0)
5338 static force_inline void
5339 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
5340 const uint32_t * mask,
5341 const uint32_t * src_top,
5342 const uint32_t * src_bottom,
5343 int32_t w,
5344 int wt,
5345 int wb,
5346 pixman_fixed_t vx,
5347 pixman_fixed_t unit_x,
5348 pixman_fixed_t max_vx,
5349 pixman_bool_t zero_src)
5351 BILINEAR_DECLARE_VARIABLES;
5352 uint32_t pix1, pix2, pix3, pix4;
5354 while ((w -= 4) >= 0)
5356 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5357 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5358 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5359 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5368 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5369 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5376 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5382 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5383 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5384 uint32_t, uint32_t, uint32_t,
5385 COVER, FLAG_NONE)
5386 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5387 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5388 uint32_t, uint32_t, uint32_t,
5389 PAD, FLAG_NONE)
5390 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5391 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5392 uint32_t, uint32_t, uint32_t,
5393 NONE, FLAG_NONE)
5394 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5395 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5396 uint32_t, uint32_t, uint32_t,
5397 NORMAL, FLAG_NONE)
5399 static force_inline void
5400 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
5401 const uint32_t * mask,
5402 const uint32_t * src_top,
5403 const uint32_t * src_bottom,
5404 int32_t w,
5405 int wt,
5406 int wb,
5407 pixman_fixed_t vx,
5408 pixman_fixed_t unit_x,
5409 pixman_fixed_t max_vx,
5410 pixman_bool_t zero_src)
5412 BILINEAR_DECLARE_VARIABLES;
5413 uint32_t pix1, pix2, pix3, pix4;
5415 while (w && ((unsigned long)dst & 15))
5417 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5422 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5432 __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5433 __m128i xmm_alpha_hi, xmm_alpha_lo;
5435 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5436 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5437 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5438 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5440 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5442 if (!is_zero (xmm_src))
5444 if (is_opaque (xmm_src))
5446 save_128_aligned ((__m128i *)dst, xmm_src);
5450 __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5452 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5453 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5455 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5456 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5457 &xmm_dst_lo, &xmm_dst_hi);
5459 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5469 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5474 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5482 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5483 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5484 uint32_t, uint32_t, uint32_t,
5485 COVER, FLAG_NONE)
5486 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5487 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5488 uint32_t, uint32_t, uint32_t,
5489 PAD, FLAG_NONE)
5490 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5491 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5492 uint32_t, uint32_t, uint32_t,
5493 NONE, FLAG_NONE)
5494 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5495 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5496 uint32_t, uint32_t, uint32_t,
5497 NORMAL, FLAG_NONE)
5499 static force_inline void
5500 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
5501 const uint8_t * mask,
5502 const uint32_t * src_top,
5503 const uint32_t * src_bottom,
5504 int32_t w,
5505 int wt,
5506 int wb,
5507 pixman_fixed_t vx,
5508 pixman_fixed_t unit_x,
5509 pixman_fixed_t max_vx,
5510 pixman_bool_t zero_src)
5512 BILINEAR_DECLARE_VARIABLES;
5513 uint32_t pix1, pix2, pix3, pix4;
5516 while (w && ((unsigned long)dst & 15))
5520 m = (uint32_t) *mask++;
5524 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5527 if (sa == 0xff && m == 0xff)
5533 __m128i ms, md, ma, msa;
5536 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5537 ms = unpack_32_1x128 (pix1);
5538 md = unpack_32_1x128 (pix2);
5540 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5542 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5547 BILINEAR_SKIP_ONE_PIXEL ();
5556 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5557 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5558 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5560 m = *(uint32_t*)mask;
5564 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5565 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5566 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5567 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5569 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5571 if (m == 0xffffffff && is_opaque (xmm_src))
5573 save_128_aligned ((__m128i *)dst, xmm_src);
5577 xmm_dst = load_128_aligned ((__m128i *)dst);
5579 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5581 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5582 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5583 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5585 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5586 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5588 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5589 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5591 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5596 BILINEAR_SKIP_ONE_PIXEL ();
5597 BILINEAR_SKIP_ONE_PIXEL ();
5598 BILINEAR_SKIP_ONE_PIXEL ();
5599 BILINEAR_SKIP_ONE_PIXEL ();
5611 m = (uint32_t) *mask++;
5615 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5618 if (sa == 0xff && m == 0xff)
5624 __m128i ms, md, ma, msa;
5627 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5628 ms = unpack_32_1x128 (pix1);
5629 md = unpack_32_1x128 (pix2);
5631 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5633 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5638 BILINEAR_SKIP_ONE_PIXEL ();
5646 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
5647 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5648 uint32_t, uint8_t, uint32_t,
5649 COVER, FLAG_HAVE_NON_SOLID_MASK)
5650 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
5651 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5652 uint32_t, uint8_t, uint32_t,
5653 PAD, FLAG_HAVE_NON_SOLID_MASK)
5654 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
5655 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5656 uint32_t, uint8_t, uint32_t,
5657 NONE, FLAG_HAVE_NON_SOLID_MASK)
5658 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
5659 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5660 uint32_t, uint8_t, uint32_t,
5661 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
5663 static const pixman_fast_path_t sse2_fast_paths[] =
5665 /* PIXMAN_OP_OVER */
5666 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5667 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5668 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5669 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5670 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5671 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5672 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5673 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5674 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5675 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5676 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5677 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5678 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5679 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5680 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5681 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5682 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5683 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5684 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5685 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5686 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5687 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5688 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5689 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5690 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5691 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5692 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5693 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5694 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5695 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5696 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5697 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5698 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5699 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5700 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5701 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5702 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5703 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5704 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5705 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5706 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5707 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5708 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5709 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5710 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5711 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5713 /* PIXMAN_OP_OVER_REVERSE */
5714 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5715 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5717 /* PIXMAN_OP_ADD */
5718 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5719 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5720 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5721 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5722 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5723 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5725 /* PIXMAN_OP_SRC */
5726 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5727 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5728 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5729 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5730 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5731 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5732 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5733 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5734 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5735 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5736 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5737 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5738 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5739 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5741 /* PIXMAN_OP_IN */
5742 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5743 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5744 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5746 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5747 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5748 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5749 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5750 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5751 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5752 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5753 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5754 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5755 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5756 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5757 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5759 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
5760 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
5761 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
5762 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
5764 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5765 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5766 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
5768 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5769 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5770 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5771 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5773 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
5774 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
5775 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
5776 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
5781 static pixman_bool_t
5782 sse2_blt (pixman_implementation_t *imp,
5783 uint32_t * src_bits,
5784 uint32_t * dst_bits,
5785 int src_stride,
5786 int dst_stride,
5787 int src_bpp,
5788 int dst_bpp,
5789 int src_x,
5790 int src_y,
5791 int dest_x,
5792 int dest_y,
5793 int width,
5794 int height)
5796 if (!pixman_blt_sse2 (
5797 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5798 src_x, src_y, dest_x, dest_y, width, height))
5801 return _pixman_implementation_blt (
5802 imp->delegate,
5803 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5804 src_x, src_y, dest_x, dest_y, width, height);
5810 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5811 __attribute__((__force_align_arg_pointer__))
5813 static pixman_bool_t
5814 sse2_fill (pixman_implementation_t *imp,
5815 uint32_t * bits,
5816 int stride,
5817 int bpp,
5818 int x,
5819 int y,
5820 int width,
5821 int height,
5822 uint32_t xor)
5824 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5826 return _pixman_implementation_fill (
5827 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5834 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
5836 int w = iter->width;
5837 __m128i ff000000 = mask_ff000000;
5838 uint32_t *dst = iter->buffer;
5839 uint32_t *src = (uint32_t *)iter->bits;
5841 iter->bits += iter->stride;
5843 while (w && ((unsigned long)dst) & 0x0f)
5845 *dst++ = (*src++) | 0xff000000;
5851 save_128_aligned (
5852 (__m128i *)dst, _mm_or_si128 (
5853 load_128_unaligned ((__m128i *)src), ff000000));
5862 *dst++ = (*src++) | 0xff000000;
5866 return iter->buffer;
5870 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
5872 int w = iter->width;
5873 uint32_t *dst = iter->buffer;
5874 uint16_t *src = (uint16_t *)iter->bits;
5875 __m128i ff000000 = mask_ff000000;
5877 iter->bits += iter->stride;
5879 while (w && ((unsigned long)dst) & 0x0f)
5881 uint16_t s = *src++;
5883 *dst++ = CONVERT_0565_TO_8888 (s);
5891 s = _mm_loadu_si128 ((__m128i *)src);
5893 lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
5894 hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
5896 save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
5897 save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
5906 uint16_t s = *src++;
5908 *dst++ = CONVERT_0565_TO_8888 (s);
5912 return iter->buffer;
5916 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
5918 int w = iter->width;
5919 uint32_t *dst = iter->buffer;
5920 uint8_t *src = iter->bits;
5921 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5923 iter->bits += iter->stride;
5925 while (w && (((unsigned long)dst) & 15))
5927 *dst++ = *(src++) << 24;
5933 xmm0 = _mm_loadu_si128((__m128i *)src);
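5934 /* two unpack passes against zero move each a8 byte into the top byte of its own 32-bit lane, i.e. alpha << 24 */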
5935 xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
5936 xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
5937 xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
5938 xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
5939 xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
5940 xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
5942 _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
5943 _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
5944 _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
5945 _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
5954 *dst++ = *(src++) << 24;
5958 return iter->buffer;
5961 typedef struct
5962 {
5963 pixman_format_code_t format;
5964 pixman_iter_get_scanline_t get_scanline;
5965 } fetcher_info_t;
5967 static const fetcher_info_t fetchers[] =
5969 { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
5970 { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
5971 { PIXMAN_a8, sse2_fetch_a8 },
5972 { PIXMAN_null },
5973 };
5976 sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
5978 pixman_image_t *image = iter->image;
5981 int width = iter->width;
5982 int height = iter->height;
5984 #define FLAGS \
5985 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
5987 if ((iter->flags & ITER_NARROW) &&
5988 (image->common.flags & FLAGS) == FLAGS &&
5990 x + width <= image->bits.width &&
5991 y + height <= image->bits.height)
5993 const fetcher_info_t *f;
5995 for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
5997 if (image->common.extended_format_code == f->format)
5999 uint8_t *b = (uint8_t *)image->bits.bits;
6000 int s = image->bits.rowstride * 4;
6002 iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
6005 iter->get_scanline = f->get_scanline;
6011 imp->delegate->src_iter_init (imp->delegate, iter);
6014 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6015 __attribute__((__force_align_arg_pointer__))
6017 pixman_implementation_t *
6018 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6020 pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6022 /* SSE2 constants */
6023 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6024 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6025 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6026 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6027 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6028 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6029 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6030 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6031 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
6032 mask_0080 = create_mask_16_128 (0x0080);
6033 mask_00ff = create_mask_16_128 (0x00ff);
6034 mask_0101 = create_mask_16_128 (0x0101);
6035 mask_ffff = create_mask_16_128 (0xffff);
6036 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6037 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6039 /* Set up function pointers */
6040 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6041 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6042 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6043 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6044 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6045 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6046 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6047 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6048 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6049 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6051 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6053 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6054 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6055 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6056 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6057 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6058 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6059 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6060 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6061 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6062 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6063 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6065 imp->blt = sse2_blt;
6066 imp->fill = sse2_fill;
6068 imp->src_iter_init = sse2_src_iter_init;