/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
# include "pixman-x64-mmx-emulation.h"
#endif
/* --------------------------------------------------------------------
 */

static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_x_alpha;

static __m64 mask_x565_rgb;
static __m64 mask_x565_unpack;

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;
/* ----------------------------------------------------------------------
 */

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
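/* A scalar sketch of the replication step above (illustrative only; the
 * helper below is not part of this file).  Each 565 field is shifted to
 * the top of its 8-bit slot and its own high bits are OR'ed back into the
 * low bits, so 0x1f widens to 0xff rather than 0xf8:
 */
#if 0
static uint32_t
expand_565_fields_sketch (uint32_t r5, uint32_t g6, uint32_t b5)
{
    uint32_t r8 = (r5 << 3) | (r5 >> 2);
    uint32_t g8 = (g6 << 2) | (g6 >> 4);
    uint32_t b8 = (b5 << 3) | (b5 >> 2);

    return (r8 << 16) | (g8 << 8) | b8;
}
#endif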
static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}
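/* The shifts above pick the top bits of each 8888 channel: red bits 23:19
 * land in 15:11, green bits 15:10 in 10:5 and blue bits 7:3 in 4:0, i.e.
 * a plain truncating 8888 -> 565 conversion.
 */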
static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
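/* Why 0x8888: _mm_movemask_epi8 yields one bit per byte, and in each
 * 4-byte ARGB pixel the alpha byte is byte 3, i.e. bits 3, 7, 11 and 15
 * of the mask.  Testing only those bits checks the four alpha bytes, so
 * is_opaque()/is_transparent() look at alpha alone while is_zero() tests
 * the whole register.
 */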
static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
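/* mask_0080 holds eight copies of 0x0080 and mask_0101 eight copies of
 * 0x0101, so the three instructions per half compute
 * ((x * a + 128) * 257) >> 16, which is x * a / 255 rounded to nearest
 * for all 8-bit inputs.  A scalar sketch of the same identity
 * (illustrative only; the helper name is made up):
 */
#if 0
static uint8_t
mul_un8_sketch (uint8_t x, uint8_t a)
{
    uint32_t t = (uint32_t) x * a + 0x80;

    return (uint8_t) ((t * 0x101) >> 16); /* exact round (x * a / 255) */
}
#endif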
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}
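/* Per component this is ret = src * alpha_dst / 255 + dst * alpha_src / 255
 * with a saturating add; it is the shared shape of the ATOP, ATOP_REVERSE
 * and XOR operators below, which differ only in which alphas get negated.
 */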
static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
/* load 4 pixels from a 16-byte aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels to a 16-byte aligned address, using a non-temporal
 * (write-combining) store
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels to a 16-byte aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels to an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}
/* ------------------------------------------------------------------
 */

static force_inline __m64
load_32_1x64 (uint32_t data)
{
    return _mm_cvtsi32_si64 (data);
}

static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
}

static force_inline __m64
expand_alpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
expand_pixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (
        unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
pix_multiply_1x64 (__m64 data,
                   __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          mask_x0080),
                           mask_x0101);
}

static force_inline __m64
pix_add_multiply_1x64 (__m64* src,
                       __m64* alpha_dst,
                       __m64* dst,
                       __m64* alpha_src)
{
    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);

    return _mm_adds_pu8 (t1, t2);
}

static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, mask_x00ff);
}

static force_inline __m64
invert_colors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
}

static force_inline __m64
in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pix_multiply_1x64 (*src, *mask),
                      pix_multiply_1x64 (*alpha, *mask),
                      *dst);
}

static force_inline __m64
over_rev_non_pre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expand_alpha_1x64 (src);

    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
                                         _mm_or_si64 (alpha, mask_x_alpha)),
                      alpha,
                      dst);
}

static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of an MMX register into
 * 00RR00GG00BB.
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, mask_x565_rgb);
    p = _mm_mullo_pi16 (p, mask_x565_unpack);

    return _mm_srli_pi16 (p, 8);
}
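/* A scalar sketch of the same trick (illustrative only; names are made up).
 * Three shifted copies of the 565 pixel put each field in its own 16-bit
 * lane, the mask isolates the fields, and one low-multiply per lane
 * replicates each field's high bits into its low bits, so a final >> 8
 * yields properly widened 8-bit channels:
 */
#if 0
static void
expand565_sketch (uint16_t m, uint8_t *r, uint8_t *g, uint8_t *b)
{
    /* r lands at bits 36-40, g at 16-21, b at 0-4 */
    uint64_t p = ((uint64_t) m << (36 - 11)) |
                 ((uint64_t) m << (16 - 5))  |
                 (uint64_t) m;

    p &= 0x01f0003f001fULL;

    /* per-lane multipliers 0x0084, 0x0410, 0x0840 (high to low lane) */
    *r = (uint8_t) ((((p >> 32) & 0xffff) * 0x0084) >> 8);
    *g = (uint8_t) ((((p >> 16) & 0xffff) * 0x0410) >> 8);
    *b = (uint8_t) (((p         & 0xffff) * 0x0840) >> 8);
}
#endif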
/* ----------------------------------------------------------------------------
 * Compose Core transformations
 */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    __m64 ms;

    ms = unpack_32_1x64 (src);
    return pack_1x64_32 (
        over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
}
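/* OVER on premultiplied pixels is dst' = src + (1 - src.alpha) * dst,
 * applied per 8-bit component with the rounded division by 255 shown at
 * pix_multiply above.  A scalar sketch (illustrative only; the helper
 * name is made up):
 */
#if 0
static uint32_t
over_u_sketch (uint32_t src, uint32_t dst)
{
    uint32_t ia = 255 - (src >> 24); /* inverse source alpha */
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t s = (src >> shift) & 0xff;
        uint32_t t = ((dst >> shift) & 0xff) * ia + 0x80;
        uint32_t c = s + ((t * 0x101) >> 16);

        if (c > 0xff)       /* saturate, like _mm_adds_epu8 */
            c = 0xff;

        result |= c << shift;
    }

    return result;
}
#endif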
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m64 ms, mm;

        mm = unpack_32_1x64 (*pm);
        mm = expand_alpha_1x64 (mm);

        ms = unpack_32_1x64 (s);
        ms = pix_multiply_1x64 (ms, mm);

        s = pack_1x64_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
static force_inline void
core_combine_over_u_sse2_mask (uint32_t *      pd,
                               const uint32_t* ps,
                               const uint32_t* pm,
                               int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t* ps,
                                  int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps;

        *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        d = *pd;
        s = *ps;

        *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }
}

static force_inline void
core_combine_over_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}
static force_inline void
core_combine_over_reverse_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}
static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (dst),
                               expand_alpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}

static force_inline void
core_combine_in_u_sse2 (uint32_t*       pd,
                        const uint32_t* ps,
                        const uint32_t* pm,
                        int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        pd += 4;
        ps += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}
static force_inline void
core_combine_reverse_in_u_sse2 (uint32_t*       pd,
                                const uint32_t* ps,
                                const uint32_t* pm,
                                int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        pd += 4;
        ps += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}
static force_inline void
core_combine_reverse_out_u_sse2 (uint32_t*       pd,
                                 const uint32_t* ps,
                                 const uint32_t* pm,
                                 int             w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}
static force_inline void
core_combine_out_u_sse2 (uint32_t*       pd,
                         const uint32_t* ps,
                         const uint32_t* pm,
                         int             w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (s), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (s), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
    __m64 da = expand_alpha_1x64 (d);

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
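/* ATOP: dst' = src * dst.alpha + dst * (1 - src.alpha).  The destination
 * alpha is preserved, since sa * da + da * (1 - sa) == da.
 */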
static force_inline void
core_combine_atop_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}
static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}

static force_inline void
core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
}
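/* XOR: dst' = src * (1 - dst.alpha) + dst * (1 - src.alpha); each pixel
 * keeps only the part not covered by the other, so where both are fully
 * opaque the result is transparent.
 */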
static force_inline void
core_combine_xor_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t *mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}
static force_inline void
core_combine_add_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t* mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        if (pm)
            pm++;
    }
}
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x64 (
            ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}
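/* SATURATE adds src into dst but first scales src so the alphas cannot sum
 * past 1: when src.alpha > 1 - dst.alpha, src is multiplied by
 * (1 - dst.alpha) / src.alpha (the DIV_UN8 above), after which a plain
 * saturating add is safe.
 */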
static force_inline void
core_combine_saturate_u_sse2 (uint32_t *      pd,
                              const uint32_t *ps,
                              const uint32_t *pm,
                              int             w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some src alpha is greater than the respective ~dst alpha */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}
static force_inline void
core_combine_src_ca_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }
}
static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expand_alpha_1x64 (s);
    __m64 unpk_mask = unpack_32_1x64 (mask);
    __m64 unpk_dst = unpack_32_1x64 (dst);

    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static force_inline void
core_combine_over_ca_sse2 (uint32_t*       pd,
                           const uint32_t* ps,
                           const uint32_t *pm,
                           int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}
static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (
        over_1x64 (d, expand_alpha_1x64 (d),
                   pix_multiply_1x64 (unpack_32_1x64 (src),
                                      unpack_32_1x64 (mask))));
}

static force_inline void
core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
                                   const uint32_t* ps,
                                   const uint32_t *pm,
                                   int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}
static force_inline void
core_combine_in_ca_sse2 (uint32_t *      pd,
                         const uint32_t *ps,
                         const uint32_t *pm,
                         int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                expand_alpha_1x64 (unpack_32_1x64 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                expand_alpha_1x64 (unpack_32_1x64 (d))));

        w--;
    }
}
static force_inline void
core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
                                 const uint32_t *ps,
                                 const uint32_t *pm,
                                 int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                pix_multiply_1x64 (unpack_32_1x64 (m),
                                   expand_alpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                pix_multiply_1x64 (unpack_32_1x64 (m),
                                   expand_alpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}
static force_inline void
core_combine_out_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));

        w--;
    }
}
static force_inline void
core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
                                  const uint32_t *ps,
                                  const uint32_t *pm,
                                  int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                negate_1x64 (pix_multiply_1x64 (
                                 unpack_32_1x64 (m),
                                 expand_alpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                negate_1x64 (pix_multiply_1x64 (
                                 unpack_32_1x64 (m),
                                 expand_alpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }
}
static force_inline uint32_t
core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);
    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = expand_alpha_1x64 (d);

    s = pix_multiply_1x64 (s, m);
    m = negate_1x64 (pix_multiply_1x64 (m, sa));

    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
}

static force_inline void
core_combine_atop_ca_sse2 (uint32_t *      pd,
                           const uint32_t *ps,
                           const uint32_t *pm,
                           int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }
}
static force_inline uint32_t
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
    __m64 sa = expand_alpha_1x64 (s);

    s = pix_multiply_1x64 (s, m);
    m = pix_multiply_1x64 (m, sa);

    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
}

static force_inline void
core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
                                   const uint32_t *ps,
                                   const uint32_t *pm,
                                   int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }
}
static force_inline uint32_t
core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                uint32_t mask,
                                uint32_t dst)
{
    __m64 a = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
                                       a, expand_alpha_1x64 (s)));
    __m64 dest      = pix_multiply_1x64 (s, a);
    __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&d,
                                                &alpha_dst,
                                                &dest,
                                                &alpha_src));
}

static force_inline void
core_combine_xor_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
        w--;
    }
}
static force_inline void
core_combine_add_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
                                             unpack_32_1x64 (m)),
                          unpack_32_1x64 (d)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (
                _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
                _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
                                             unpack_32_1x64 (m)),
                          unpack_32_1x64 (d)));
        w--;
    }
}
/* ---------------------------------------------------
 * fb_compose_setup_SSE2
 */
static force_inline __m64
create_mask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static force_inline __m128i
create_mask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static force_inline __m64
create_mask_2x32_64 (uint32_t mask0,
                     uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

/* Work around a code generation bug in Sun Studio 12. */
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
# define create_mask_2x32_128(mask0, mask1)                             \
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
#endif
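/* The static mask_* constants declared at the top of the file are filled
 * in with these helpers by setup code that is elided here.  A sketch of
 * the likely initialization, inferred from how the masks are used above
 * (values are assumptions, not taken from this excerpt):
 */
#if 0
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);

    mask_x0080 = create_mask_16_64 (0x0080);
    mask_x00ff = create_mask_16_64 (0x00ff);
    mask_x0101 = create_mask_16_64 (0x0101);
    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
#endif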
/* SSE2 code patch for fbcompose.c */

static void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_over_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dst,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    core_combine_over_reverse_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dst,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    core_combine_in_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dst,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    core_combine_reverse_in_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_out_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dst,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    core_combine_reverse_out_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_atop_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dst,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_xor_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_add_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               dst,
                         const uint32_t *         src,
                         const uint32_t *         mask,
                         int                      width)
{
    core_combine_saturate_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_src_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               dst,
                      const uint32_t *         src,
                      const uint32_t *         mask,
                      int                      width)
{
    core_combine_over_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               dst,
                              const uint32_t *         src,
                              const uint32_t *         mask,
                              int                      width)
{
    core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_in_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dst,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_out_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dst,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               dst,
                      const uint32_t *         src,
                      const uint32_t *         mask,
                      int                      width)
{
    core_combine_atop_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               dst,
                              const uint32_t *         src,
                              const uint32_t *         mask,
                              int                      width)
{
    core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_xor_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_xor_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_add_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_add_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* -------------------------------------------------------------------
 * composite_over_n_8888
 */
static void
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            pixman_image_t *         src_image,
                            pixman_image_t *         mask_image,
                            pixman_image_t *         dst_image,
                            int32_t                  src_x,
                            int32_t                  src_y,
                            int32_t                  mask_x,
                            int32_t                  mask_y,
                            int32_t                  dest_x,
                            int32_t                  dest_y,
                            int32_t                  width,
                            int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
                                              _mm_movepi64_pi64 (xmm_alpha),
                                              unpack_32_1x64 (d)));
            w--;
        }

        while (w >= 4)
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
                                              _mm_movepi64_pi64 (xmm_alpha),
                                              unpack_32_1x64 (d)));
            w--;
        }
    }

    _mm_empty ();
}
/* ---------------------------------------------------------------------
 * composite_over_n_0565
 */
static void
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            pixman_image_t *         src_image,
                            pixman_image_t *         mask_image,
                            pixman_image_t *         dst_image,
                            int32_t                  src_x,
                            int32_t                  src_y,
                            int32_t                  mask_x,
                            int32_t                  mask_y,
                            int32_t                  dest_x,
                            int32_t                  dest_y,
                            int32_t                  width,
                            int32_t                  height)
{
    uint32_t src;
    uint16_t    *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;

            *dst++ = pack_565_32_16 (
                pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
                                         _mm_movepi64_pi64 (xmm_alpha),
                                         expand565_16_1x64 (d))));
            w--;
        }

        while (w >= 8)
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst0, &xmm_dst1);
            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst2, &xmm_dst3);

            xmm_dst = pack_565_4x128_128 (
                &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            save_128_aligned ((__m128i*)dst, xmm_dst);

            dst += 8;
            w -= 8;
        }

        while (w--)
        {
            d = *dst;
            *dst++ = pack_565_32_16 (
                pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
                                         _mm_movepi64_pi64 (xmm_alpha),
                                         expand565_16_1x64 (d))));
        }
    }

    _mm_empty ();
}
/* ------------------------------
 * composite_add_n_8888_8888_ca
 */
static void
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_op_t              op,
                                   pixman_image_t *         src_image,
                                   pixman_image_t *         mask_image,
                                   pixman_image_t *         dst_image,
                                   int32_t                  src_x,
                                   int32_t                  src_y,
                                   int32_t                  mask_x,
                                   int32_t                  mask_y,
                                   int32_t                  dest_x,
                                   int32_t                  dest_y,
                                   int32_t                  width,
                                   int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, d;
    uint32_t    *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src   = _mm_movepi64_pi64 (xmm_src);
    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);

    while (height--)
    {
        int w = width;
        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)pd & 15)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;

                mmx_mask = unpack_32_1x64 (m);
                mmx_dest = unpack_32_1x64 (d);

                *pd = pack_1x64_32 (
                    _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
            }

            pd++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)pm);

            pack_cmp =
                _mm_movemask_epi8 (
                    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            /* if all bits in mask are zero, pack_cmp is 0xffff */
            if (pack_cmp != 0xffff)
            {
                xmm_dst = load_128_aligned ((__m128i*)pd);

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                pix_multiply_2x128 (&xmm_src, &xmm_src,
                                    &xmm_mask_lo, &xmm_mask_hi,
                                    &xmm_mask_lo, &xmm_mask_hi);
                xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);

                save_128_aligned (
                    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
            }

            pd += 4;
            pm += 4;
            w -= 4;
        }

        while (w)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;

                mmx_mask = unpack_32_1x64 (m);
                mmx_dest = unpack_32_1x64 (d);

                *pd = pack_1x64_32 (
                    _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
            }

            pd++;
            w--;
        }
    }

    _mm_empty ();
}
/* ---------------------------------------------------------------------------
 * composite_over_n_8888_8888_ca
 */
static void
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, d;
    uint32_t    *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src   = _mm_movepi64_pi64 (xmm_src);
    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);

    while (height--)
    {
        int w = width;
        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (unsigned long)pd & 15)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;

                mmx_mask = unpack_32_1x64 (m);
                mmx_dest = unpack_32_1x64 (d);

                *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
                                                  &mmx_alpha,
                                                  &mmx_mask,
                                                  &mmx_dest));
            }

            pd++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)pm);

            pack_cmp =
                _mm_movemask_epi8 (
                    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            /* if all bits in mask are zero, pack_cmp is 0xffff */
            if (pack_cmp != 0xffff)
            {
                xmm_dst = load_128_aligned ((__m128i*)pd);

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            pd += 4;
            pm += 4;
            w -= 4;
        }

        while (w)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;

                mmx_mask = unpack_32_1x64 (m);
                mmx_dest = unpack_32_1x64 (d);

                *pd = pack_1x64_32 (
                    in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
            }

            pd++;
            w--;
        }
    }

    _mm_empty ();
}
/*---------------------------------------------------------------------
 * composite_over_8888_n_8888
 */
static void
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    uint32_t mask;
    int32_t w;
    int dst_stride, src_stride;

    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);

    xmm_mask = create_mask_16_128 (mask >> 24);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            uint32_t s = *src++;

            if (s)
            {
                uint32_t d = *dst;

                __m64 ms = unpack_32_1x64 (s);
                __m64 alpha = expand_alpha_1x64 (ms);
                __m64 dest  = _mm_movepi64_pi64 (xmm_mask);
                __m64 alpha_dst = unpack_32_1x64 (d);

                *dst = pack_1x64_32 (
                    in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
            }
            dst++;
            w--;
        }

        while (w >= 4)
        {
            xmm_src = load_128_unaligned ((__m128i*)src);

            if (!is_zero (xmm_src))
            {
                xmm_dst = load_128_aligned ((__m128i*)dst);

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
                expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                    &xmm_alpha_lo, &xmm_alpha_hi);

                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                               &xmm_alpha_lo, &xmm_alpha_hi,
                               &xmm_mask, &xmm_mask,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t s = *src++;

            if (s)
            {
                uint32_t d = *dst;

                __m64 ms = unpack_32_1x64 (s);
                __m64 alpha = expand_alpha_1x64 (ms);
                __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
                __m64 dest  = unpack_32_1x64 (d);

                *dst = pack_1x64_32 (
                    in_over_1x64 (&ms, &alpha, &mask, &dest));
            }

            dst++;
            w--;
        }
    }

    _mm_empty ();
}
3215 /*---------------------------------------------------------------------
3216 * composite_src_x888_8888
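* Copies x888 pixels into an 8888 destination, forcing the alpha byte
* to 0xff.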
3220 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
3222 pixman_image_t * src_image,
3223 pixman_image_t * mask_image,
3224 pixman_image_t * dst_image,
3234 uint32_t *dst_line, *dst;
3235 uint32_t *src_line, *src;
3237 int dst_stride, src_stride;
3240 PIXMAN_IMAGE_GET_LINE (
3241 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3242 PIXMAN_IMAGE_GET_LINE (
3243 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3248 dst_line += dst_stride;
3250 src_line += src_stride;
3253 while (w && (unsigned long)dst & 15)
3255 *dst++ = *src++ | 0xff000000;
3261 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
3263 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
3264 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
3265 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
3266 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
3268 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
3269 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
3270 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
3271 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
3280 *dst++ = *src++ | 0xff000000;
3288 /* ---------------------------------------------------------------------
3289 * composite_over_x888_n_8888
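* Treats the x888 source as opaque (alpha forced to 0xff) and
* composites it through a solid mask.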
3292 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3294 pixman_image_t * src_image,
3295 pixman_image_t * mask_image,
3296 pixman_image_t * dst_image,
3306 uint32_t *dst_line, *dst;
3307 uint32_t *src_line, *src;
3309 int dst_stride, src_stride;
3312 __m128i xmm_mask, xmm_alpha;
3313 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3314 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3316 PIXMAN_IMAGE_GET_LINE (
3317 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3318 PIXMAN_IMAGE_GET_LINE (
3319 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3321 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3323 xmm_mask = create_mask_16_128 (mask >> 24);
3324 xmm_alpha = mask_00ff;
3329 dst_line += dst_stride;
3331 src_line += src_stride;
3334 while (w && (unsigned long)dst & 15)
3336 uint32_t s = (*src++) | 0xff000000;
3339 __m64 src = unpack_32_1x64 (s);
3340 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3341 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3342 __m64 dest = unpack_32_1x64 (d);
3344 *dst++ = pack_1x64_32 (
3345 in_over_1x64 (&src, &alpha, &mask, &dest));
3352 xmm_src = _mm_or_si128 (
3353 load_128_unaligned ((__m128i*)src), mask_ff000000);
3354 xmm_dst = load_128_aligned ((__m128i*)dst);
3356 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3357 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3359 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3360 &xmm_alpha, &xmm_alpha,
3361 &xmm_mask, &xmm_mask,
3362 &xmm_dst_lo, &xmm_dst_hi);
3365 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3375 uint32_t s = (*src++) | 0xff000000;
3378 __m64 src = unpack_32_1x64 (s);
3379 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3380 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3381 __m64 dest = unpack_32_1x64 (d);
3383 *dst++ = pack_1x64_32 (
3384 in_over_1x64 (&src, &alpha, &mask, &dest));
3393 /* --------------------------------------------------------------------
3394 * composite_over_8888_8888
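* Plain OVER of one 8888 image onto another; each scanline is
* delegated to core_combine_over_u_sse2.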
3397 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3399 pixman_image_t * src_image,
3400 pixman_image_t * mask_image,
3401 pixman_image_t * dst_image,
3411 int dst_stride, src_stride;
3412 uint32_t *dst_line, *dst;
3413 uint32_t *src_line, *src;
3415 PIXMAN_IMAGE_GET_LINE (
3416 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3417 PIXMAN_IMAGE_GET_LINE (
3418 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3425 core_combine_over_u_sse2 (dst, src, NULL, width);
3433 /* ------------------------------------------------------------------
3434 * composite_over_8888_0565
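* OVER onto r5g6b5: destination 565 pixels are expanded to 8888,
* blended, then packed back to 565.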
3436 static force_inline uint16_t
3437 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3441 ms = unpack_32_1x64 (src);
3442 return pack_565_32_16 (
3445 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3449 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3451 pixman_image_t * src_image,
3452 pixman_image_t * mask_image,
3453 pixman_image_t * dst_image,
3463 uint16_t *dst_line, *dst, d;
3464 uint32_t *src_line, *src, s;
3465 int dst_stride, src_stride;
3468 __m128i xmm_alpha_lo, xmm_alpha_hi;
3469 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3470 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3472 PIXMAN_IMAGE_GET_LINE (
3473 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3474 PIXMAN_IMAGE_GET_LINE (
3475 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3480 * This code was copied from the MMX implementation, FIXME included.
3481 * If it's a problem there, it's probably a problem here as well.
3483 assert (src_image->drawable == mask_image->drawable);
3491 dst_line += dst_stride;
3492 src_line += src_stride;
3495 /* Align dst on a 16-byte boundary */
3497 ((unsigned long)dst & 15))
3502 *dst++ = composite_over_8888_0565pixel (s, d);
3506 /* This is an 8-pixel loop: two 4-pixel halves are blended and packed back to 565 together */
3509 /* Load the source unaligned: only dst was aligned above, so the
3510 * source address may not sit on a 16-byte boundary.
3512 xmm_src = load_128_unaligned ((__m128i*) src);
3513 xmm_dst = load_128_aligned ((__m128i*) dst);
3516 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3517 unpack_565_128_4x128 (xmm_dst,
3518 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3519 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3520 &xmm_alpha_lo, &xmm_alpha_hi);
3522 /* Preload the next 4 source pixels so the memory read overlaps
3523 * with the blending below.
3525 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3527 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3528 &xmm_alpha_lo, &xmm_alpha_hi,
3529 &xmm_dst0, &xmm_dst1);
3532 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3533 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3534 &xmm_alpha_lo, &xmm_alpha_hi);
3536 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3537 &xmm_alpha_lo, &xmm_alpha_hi,
3538 &xmm_dst2, &xmm_dst3);
3541 (__m128i*)dst, pack_565_4x128_128 (
3542 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3554 *dst++ = composite_over_8888_0565pixel (s, d);
3561 /* -----------------------------------------------------------------
3562 * composite_over_n_8_8888
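* Solid source composited OVER the destination through an a8 mask.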
3566 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3568 pixman_image_t * src_image,
3569 pixman_image_t * mask_image,
3570 pixman_image_t * dst_image,
3581 uint32_t *dst_line, *dst;
3582 uint8_t *mask_line, *mask;
3583 int dst_stride, mask_stride;
3587 __m128i xmm_src, xmm_alpha, xmm_def;
3588 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3589 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3591 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3593 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3599 PIXMAN_IMAGE_GET_LINE (
3600 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3601 PIXMAN_IMAGE_GET_LINE (
3602 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3604 xmm_def = create_mask_2x32_128 (src, src);
3605 xmm_src = expand_pixel_32_1x128 (src);
3606 xmm_alpha = expand_alpha_1x128 (xmm_src);
3607 mmx_src = _mm_movepi64_pi64 (xmm_src);
3608 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3613 dst_line += dst_stride;
3615 mask_line += mask_stride;
3618 while (w && (unsigned long)dst & 15)
3620 uint8_t m = *mask++;
3625 mmx_mask = expand_pixel_8_1x64 (m);
3626 mmx_dest = unpack_32_1x64 (d);
3628 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3640 m = *((uint32_t*)mask);
3642 if (srca == 0xff && m == 0xffffffff)
3644 save_128_aligned ((__m128i*)dst, xmm_def);
3648 xmm_dst = load_128_aligned ((__m128i*) dst);
3649 xmm_mask = unpack_32_1x128 (m);
3650 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3653 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3654 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3656 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3657 &xmm_mask_lo, &xmm_mask_hi);
3659 in_over_2x128 (&xmm_src, &xmm_src,
3660 &xmm_alpha, &xmm_alpha,
3661 &xmm_mask_lo, &xmm_mask_hi,
3662 &xmm_dst_lo, &xmm_dst_hi);
3665 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3675 uint8_t m = *mask++;
3680 mmx_mask = expand_pixel_8_1x64 (m);
3681 mmx_dest = unpack_32_1x64 (d);
3683 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3697 /* ----------------------------------------------------------------
3698 * pixman_fill_sse2
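* Solid fill for 8, 16 and 32 bpp surfaces: the fill value is
* replicated to 32 bits, broadcast to a 128-bit register, and written
* in aligned runs of up to 128 bytes per iteration.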
3702 pixman_fill_sse2 (uint32_t *bits,
3711 uint32_t byte_width;
3721 stride = stride * (int) sizeof (uint32_t) / 1;
3722 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3728 data = (w << 16) | w;
3732 stride = stride * (int) sizeof (uint32_t) / 2;
3733 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3734 byte_width = 2 * width;
3737 data = (data & 0xffff) * 0x00010001;
3741 stride = stride * (int) sizeof (uint32_t) / 4;
3742 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3743 byte_width = 4 * width;
3751 xmm_def = create_mask_2x32_128 (data, data);
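/* data now holds the pattern replicated to 32 bits; xmm_def broadcasts it across the whole register */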
3756 uint8_t *d = byte_line;
3757 byte_line += stride;
3760 while (w >= 1 && ((unsigned long)d & 1))
3762 *(uint8_t *)d = data;
3767 while (w >= 2 && ((unsigned long)d & 3))
3769 *(uint16_t *)d = data;
3774 while (w >= 4 && ((unsigned long)d & 15))
3776 *(uint32_t *)d = data;
3784 save_128_aligned ((__m128i*)(d), xmm_def);
3785 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3786 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3787 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3788 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3789 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3790 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3791 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3799 save_128_aligned ((__m128i*)(d), xmm_def);
3800 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3801 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3802 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3810 save_128_aligned ((__m128i*)(d), xmm_def);
3811 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3819 save_128_aligned ((__m128i*)(d), xmm_def);
3827 *(uint32_t *)d = data;
3835 *(uint16_t *)d = data;
3842 *(uint8_t *)d = data;
3853 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3855 pixman_image_t * src_image,
3856 pixman_image_t * mask_image,
3857 pixman_image_t * dst_image,
3868 uint32_t *dst_line, *dst;
3869 uint8_t *mask_line, *mask;
3870 int dst_stride, mask_stride;
3874 __m128i xmm_src, xmm_def;
3875 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3877 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3882 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3883 PIXMAN_FORMAT_BPP (dst_image->bits.format),
3884 dest_x, dest_y, width, height, 0);
3888 PIXMAN_IMAGE_GET_LINE (
3889 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3890 PIXMAN_IMAGE_GET_LINE (
3891 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3893 xmm_def = create_mask_2x32_128 (src, src);
3894 xmm_src = expand_pixel_32_1x128 (src);
3899 dst_line += dst_stride;
3901 mask_line += mask_stride;
3904 while (w && (unsigned long)dst & 15)
3906 uint8_t m = *mask++;
3910 *dst = pack_1x64_32 (
3912 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3925 m = *((uint32_t*)mask);
3927 if (srca == 0xff && m == 0xffffffff)
3929 save_128_aligned ((__m128i*)dst, xmm_def);
3933 xmm_mask = unpack_32_1x128 (m);
3934 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3937 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3939 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3940 &xmm_mask_lo, &xmm_mask_hi);
3942 pix_multiply_2x128 (&xmm_src, &xmm_src,
3943 &xmm_mask_lo, &xmm_mask_hi,
3944 &xmm_mask_lo, &xmm_mask_hi);
3947 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3951 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3961 uint8_t m = *mask++;
3965 *dst = pack_1x64_32 (
3967 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3982 /*-----------------------------------------------------------------------
3983 * composite_over_n_8_0565
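* Solid source, a8 mask, r5g6b5 destination.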
3987 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3989 pixman_image_t * src_image,
3990 pixman_image_t * mask_image,
3991 pixman_image_t * dst_image,
4002 uint16_t *dst_line, *dst, d;
4003 uint8_t *mask_line, *mask;
4004 int dst_stride, mask_stride;
4007 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4009 __m128i xmm_src, xmm_alpha;
4010 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4011 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4013 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4019 PIXMAN_IMAGE_GET_LINE (
4020 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4021 PIXMAN_IMAGE_GET_LINE (
4022 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4024 xmm_src = expand_pixel_32_1x128 (src);
4025 xmm_alpha = expand_alpha_1x128 (xmm_src);
4026 mmx_src = _mm_movepi64_pi64 (xmm_src);
4027 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4032 dst_line += dst_stride;
4034 mask_line += mask_stride;
4037 while (w && (unsigned long)dst & 15)
4044 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4045 mmx_dest = expand565_16_1x64 (d);
4047 *dst = pack_565_32_16 (
4050 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4059 xmm_dst = load_128_aligned ((__m128i*) dst);
4060 unpack_565_128_4x128 (xmm_dst,
4061 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4063 m = *((uint32_t*)mask);
4068 xmm_mask = unpack_32_1x128 (m);
4069 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4072 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4074 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4075 &xmm_mask_lo, &xmm_mask_hi);
4077 in_over_2x128 (&xmm_src, &xmm_src,
4078 &xmm_alpha, &xmm_alpha,
4079 &xmm_mask_lo, &xmm_mask_hi,
4080 &xmm_dst0, &xmm_dst1);
4083 m = *((uint32_t*)mask);
4088 xmm_mask = unpack_32_1x128 (m);
4089 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4092 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4094 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4095 &xmm_mask_lo, &xmm_mask_hi);
4096 in_over_2x128 (&xmm_src, &xmm_src,
4097 &xmm_alpha, &xmm_alpha,
4098 &xmm_mask_lo, &xmm_mask_hi,
4099 &xmm_dst2, &xmm_dst3);
4103 (__m128i*)dst, pack_565_4x128_128 (
4104 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4117 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4118 mmx_dest = expand565_16_1x64 (d);
4120 *dst = pack_565_32_16 (
4123 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4134 /* -----------------------------------------------------------------------
4135 * composite_over_pixbuf_0565
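* A pixbuf source is stored non-premultiplied with its color channels
* in the opposite order from the destination; invert_colors_* reorders
* them and over_rev_non_pre_* premultiplies before blending into 565.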
4139 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4141 pixman_image_t * src_image,
4142 pixman_image_t * mask_image,
4143 pixman_image_t * dst_image,
4153 uint16_t *dst_line, *dst, d;
4154 uint32_t *src_line, *src, s;
4155 int dst_stride, src_stride;
4157 uint32_t opaque, zero;
4160 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4161 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4163 PIXMAN_IMAGE_GET_LINE (
4164 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4165 PIXMAN_IMAGE_GET_LINE (
4166 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4171 * This code was copied from the MMX implementation, FIXME included.
4172 * If it's a problem there, it's probably a problem here as well.
4174 assert (src_image->drawable == mask_image->drawable);
4180 dst_line += dst_stride;
4182 src_line += src_stride;
4185 while (w && (unsigned long)dst & 15)
4190 ms = unpack_32_1x64 (s);
4192 *dst++ = pack_565_32_16 (
4194 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4201 xmm_src = load_128_unaligned ((__m128i*)src);
4202 xmm_dst = load_128_aligned ((__m128i*)dst);
4204 opaque = is_opaque (xmm_src);
4205 zero = is_zero (xmm_src);
4207 unpack_565_128_4x128 (xmm_dst,
4208 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4209 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4211 /* preload next round */
4212 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4216 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4217 &xmm_dst0, &xmm_dst1);
4221 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4222 &xmm_dst0, &xmm_dst1);
4226 opaque = is_opaque (xmm_src);
4227 zero = is_zero (xmm_src);
4229 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4233 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4234 &xmm_dst2, &xmm_dst3);
4238 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4239 &xmm_dst2, &xmm_dst3);
4243 (__m128i*)dst, pack_565_4x128_128 (
4244 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4256 ms = unpack_32_1x64 (s);
4258 *dst++ = pack_565_32_16 (
4260 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4268 /* -------------------------------------------------------------------------
4269 * composite_over_pixbuf_8888
4273 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4275 pixman_image_t * src_image,
4276 pixman_image_t * mask_image,
4277 pixman_image_t * dst_image,
4287 uint32_t *dst_line, *dst, d;
4288 uint32_t *src_line, *src, s;
4289 int dst_stride, src_stride;
4291 uint32_t opaque, zero;
4293 __m128i xmm_src_lo, xmm_src_hi;
4294 __m128i xmm_dst_lo, xmm_dst_hi;
4296 PIXMAN_IMAGE_GET_LINE (
4297 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4298 PIXMAN_IMAGE_GET_LINE (
4299 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4304 * This code was copied from the MMX implementation, FIXME included.
4305 * If it's a problem there, it's probably a problem here as well.
4307 assert (src_image->drawable == mask_image->drawable);
4313 dst_line += dst_stride;
4315 src_line += src_stride;
4318 while (w && (unsigned long)dst & 15)
4323 *dst++ = pack_1x64_32 (
4324 over_rev_non_pre_1x64 (
4325 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4332 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4334 opaque = is_opaque (xmm_src_hi);
4335 zero = is_zero (xmm_src_hi);
4337 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4341 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4342 &xmm_dst_lo, &xmm_dst_hi);
4345 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4349 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4351 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4353 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4354 &xmm_dst_lo, &xmm_dst_hi);
4357 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4370 *dst++ = pack_1x64_32 (
4371 over_rev_non_pre_1x64 (
4372 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4381 /* -------------------------------------------------------------------------------------------------
4382 * composite_over_n_8888_0565_ca
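* Component-alpha OVER: the 8888 mask modulates red, green and blue
* independently.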
4386 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4388 pixman_image_t * src_image,
4389 pixman_image_t * mask_image,
4390 pixman_image_t * dst_image,
4401 uint16_t *dst_line, *dst, d;
4402 uint32_t *mask_line, *mask, m;
4403 int dst_stride, mask_stride;
4407 __m128i xmm_src, xmm_alpha;
4408 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4409 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4411 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4413 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4418 PIXMAN_IMAGE_GET_LINE (
4419 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4420 PIXMAN_IMAGE_GET_LINE (
4421 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4423 xmm_src = expand_pixel_32_1x128 (src);
4424 xmm_alpha = expand_alpha_1x128 (xmm_src);
4425 mmx_src = _mm_movepi64_pi64 (xmm_src);
4426 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4433 mask_line += mask_stride;
4434 dst_line += dst_stride;
4436 while (w && ((unsigned long)dst & 15))
4438 m = *(uint32_t *) mask;
4443 mmx_mask = unpack_32_1x64 (m);
4444 mmx_dest = expand565_16_1x64 (d);
4446 *dst = pack_565_32_16 (
4449 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4460 xmm_mask = load_128_unaligned ((__m128i*)mask);
4461 xmm_dst = load_128_aligned ((__m128i*)dst);
4463 pack_cmp = _mm_movemask_epi8 (
4464 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4466 unpack_565_128_4x128 (xmm_dst,
4467 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4468 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4470 /* preload next round */
4471 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4474 if (pack_cmp != 0xffff)
4476 in_over_2x128 (&xmm_src, &xmm_src,
4477 &xmm_alpha, &xmm_alpha,
4478 &xmm_mask_lo, &xmm_mask_hi,
4479 &xmm_dst0, &xmm_dst1);
4483 pack_cmp = _mm_movemask_epi8 (
4484 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4486 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4488 if (pack_cmp != 0xffff)
4490 in_over_2x128 (&xmm_src, &xmm_src,
4491 &xmm_alpha, &xmm_alpha,
4492 &xmm_mask_lo, &xmm_mask_hi,
4493 &xmm_dst2, &xmm_dst3);
4497 (__m128i*)dst, pack_565_4x128_128 (
4498 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4507 m = *(uint32_t *) mask;
4512 mmx_mask = unpack_32_1x64 (m);
4513 mmx_dest = expand565_16_1x64 (d);
4515 *dst = pack_565_32_16 (
4518 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4530 /* -----------------------------------------------------------------------
4531 * composite_in_n_8_8
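* IN: the a8 destination is multiplied by both the solid source's
* alpha and the a8 mask.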
4535 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4537 pixman_image_t * src_image,
4538 pixman_image_t * mask_image,
4539 pixman_image_t * dst_image,
4549 uint8_t *dst_line, *dst;
4550 uint8_t *mask_line, *mask;
4551 int dst_stride, mask_stride;
4558 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4559 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4561 PIXMAN_IMAGE_GET_LINE (
4562 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4563 PIXMAN_IMAGE_GET_LINE (
4564 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4566 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4570 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4575 dst_line += dst_stride;
4577 mask_line += mask_stride;
4580 while (w && ((unsigned long)dst & 15))
4582 m = (uint32_t) *mask++;
4583 d = (uint32_t) *dst;
4585 *dst++ = (uint8_t) pack_1x64_32 (
4587 pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4588 unpack_32_1x64 (m)),
4589 unpack_32_1x64 (d)));
4595 xmm_mask = load_128_unaligned ((__m128i*)mask);
4596 xmm_dst = load_128_aligned ((__m128i*)dst);
4598 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4599 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4601 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4602 &xmm_mask_lo, &xmm_mask_hi,
4603 &xmm_mask_lo, &xmm_mask_hi);
4605 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4606 &xmm_dst_lo, &xmm_dst_hi,
4607 &xmm_dst_lo, &xmm_dst_hi);
4610 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4619 m = (uint32_t) *mask++;
4620 d = (uint32_t) *dst;
4622 *dst++ = (uint8_t) pack_1x64_32 (
4625 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4626 unpack_32_1x64 (d)));
4634 /* -----------------------------------------------------------------------
4639 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4641 pixman_image_t * src_image,
4642 pixman_image_t * mask_image,
4643 pixman_image_t * dst_image,
4653 uint8_t *dst_line, *dst;
4660 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4662 PIXMAN_IMAGE_GET_LINE (
4663 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4665 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4667 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4676 pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4677 8, dest_x, dest_y, width, height, src);
4685 dst_line += dst_stride;
4688 while (w && ((unsigned long)dst & 15))
4690 d = (uint32_t) *dst;
4692 *dst++ = (uint8_t) pack_1x64_32 (
4694 _mm_movepi64_pi64 (xmm_alpha),
4695 unpack_32_1x64 (d)));
4701 xmm_dst = load_128_aligned ((__m128i*)dst);
4703 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4705 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4706 &xmm_dst_lo, &xmm_dst_hi,
4707 &xmm_dst_lo, &xmm_dst_hi);
4710 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4718 d = (uint32_t) *dst;
4720 *dst++ = (uint8_t) pack_1x64_32 (
4722 _mm_movepi64_pi64 (xmm_alpha),
4723 unpack_32_1x64 (d)));
4731 /* ---------------------------------------------------------------------------
4736 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4738 pixman_image_t * src_image,
4739 pixman_image_t * mask_image,
4740 pixman_image_t * dst_image,
4750 uint8_t *dst_line, *dst;
4751 uint8_t *src_line, *src;
4752 int src_stride, dst_stride;
4756 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4757 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4759 PIXMAN_IMAGE_GET_LINE (
4760 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4761 PIXMAN_IMAGE_GET_LINE (
4762 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4767 dst_line += dst_stride;
4769 src_line += src_stride;
4772 while (w && ((unsigned long)dst & 15))
4774 s = (uint32_t) *src++;
4775 d = (uint32_t) *dst;
4777 *dst++ = (uint8_t) pack_1x64_32 (
4779 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4785 xmm_src = load_128_unaligned ((__m128i*)src);
4786 xmm_dst = load_128_aligned ((__m128i*)dst);
4788 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4789 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4791 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4792 &xmm_dst_lo, &xmm_dst_hi,
4793 &xmm_dst_lo, &xmm_dst_hi);
4796 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4805 s = (uint32_t) *src++;
4806 d = (uint32_t) *dst;
4808 *dst++ = (uint8_t) pack_1x64_32 (
4809 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
4817 /* -------------------------------------------------------------------------
4818 * composite_add_n_8_8
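* Saturated ADD of (source alpha * mask) into an a8 destination.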
4822 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4824 pixman_image_t * src_image,
4825 pixman_image_t * mask_image,
4826 pixman_image_t * dst_image,
4836 uint8_t *dst_line, *dst;
4837 uint8_t *mask_line, *mask;
4838 int dst_stride, mask_stride;
4845 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4846 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4848 PIXMAN_IMAGE_GET_LINE (
4849 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4850 PIXMAN_IMAGE_GET_LINE (
4851 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4853 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4857 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4862 dst_line += dst_stride;
4864 mask_line += mask_stride;
4867 while (w && ((unsigned long)dst & 15))
4869 m = (uint32_t) *mask++;
4870 d = (uint32_t) *dst;
4872 *dst++ = (uint8_t) pack_1x64_32 (
4875 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4876 unpack_32_1x64 (d)));
4882 xmm_mask = load_128_unaligned ((__m128i*)mask);
4883 xmm_dst = load_128_aligned ((__m128i*)dst);
4885 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4886 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4888 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4889 &xmm_mask_lo, &xmm_mask_hi,
4890 &xmm_mask_lo, &xmm_mask_hi);
4892 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4893 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4896 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4905 m = (uint32_t) *mask++;
4906 d = (uint32_t) *dst;
4908 *dst++ = (uint8_t) pack_1x64_32 (
4911 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4912 unpack_32_1x64 (d)));
4921 /* -------------------------------------------------------------------------
4922 * composite_add_n_8
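* Saturated ADD of a solid color into an a8 destination; a fully
* opaque source reduces to a plain 0xff fill.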
4926 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4928 pixman_image_t * src_image,
4929 pixman_image_t * mask_image,
4930 pixman_image_t * dst_image,
4940 uint8_t *dst_line, *dst;
4947 PIXMAN_IMAGE_GET_LINE (
4948 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4950 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4959 pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4960 8, dest_x, dest_y, width, height, 0xff);
4965 src = (src << 24) | (src << 16) | (src << 8) | src;
4966 xmm_src = _mm_set_epi32 (src, src, src, src);
4971 dst_line += dst_stride;
4974 while (w && ((unsigned long)dst & 15))
4976 *dst = (uint8_t)_mm_cvtsi64_si32 (
4978 _mm_movepi64_pi64 (xmm_src),
4979 _mm_cvtsi32_si64 (*dst)));
4988 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4996 *dst = (uint8_t)_mm_cvtsi64_si32 (
4998 _mm_movepi64_pi64 (xmm_src),
4999 _mm_cvtsi32_si64 (*dst)));
5009 /* ----------------------------------------------------------------------
5014 sse2_composite_add_8_8 (pixman_implementation_t *imp,
5016 pixman_image_t * src_image,
5017 pixman_image_t * mask_image,
5018 pixman_image_t * dst_image,
5028 uint8_t *dst_line, *dst;
5029 uint8_t *src_line, *src;
5030 int dst_stride, src_stride;
5034 PIXMAN_IMAGE_GET_LINE (
5035 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5036 PIXMAN_IMAGE_GET_LINE (
5037 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5044 dst_line += dst_stride;
5045 src_line += src_stride;
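/* t | (0 - (t >> 8)) saturates the 8-bit sum: when t overflows 0xff,
 * (t >> 8) is 1 and the expression truncates to 0xff.
 * E.g. 0xf0 + 0x20 = 0x110 -> 0x110 | (0 - 1) -> 0xff. */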
5049 while (w && (unsigned long)dst & 3)
5051 t = (*dst) + (*src++);
5052 *dst++ = t | (0 - (t >> 8));
5056 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5066 t = (*dst) + (*src++);
5067 *dst++ = t | (0 - (t >> 8));
5075 /* ---------------------------------------------------------------------
5076 * composite_add_8888_8888
5079 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5081 pixman_image_t * src_image,
5082 pixman_image_t * mask_image,
5083 pixman_image_t * dst_image,
5093 uint32_t *dst_line, *dst;
5094 uint32_t *src_line, *src;
5095 int dst_stride, src_stride;
5097 PIXMAN_IMAGE_GET_LINE (
5098 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5099 PIXMAN_IMAGE_GET_LINE (
5100 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5105 dst_line += dst_stride;
5107 src_line += src_stride;
5109 core_combine_add_u_sse2 (dst, src, NULL, width);
5115 /* -------------------------------------------------------------------------------------------------
5116 * sse2_composite_copy_area
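* pixman_blt_sse2 does the raw rectangle copy (16 and 32 bpp only);
* sse2_composite_copy_area wraps it for the fast-path table.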
5119 static pixman_bool_t
5120 pixman_blt_sse2 (uint32_t *src_bits,
5133 uint8_t * src_bytes;
5134 uint8_t * dst_bytes;
5137 if (src_bpp != dst_bpp)
5142 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5143 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5144 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5145 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5146 byte_width = 2 * width;
5150 else if (src_bpp == 32)
5152 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5153 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5154 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5155 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5156 byte_width = 4 * width;
5168 uint8_t *s = src_bytes;
5169 uint8_t *d = dst_bytes;
5170 src_bytes += src_stride;
5171 dst_bytes += dst_stride;
5174 while (w >= 2 && ((unsigned long)d & 3))
5176 *(uint16_t *)d = *(uint16_t *)s;
5182 while (w >= 4 && ((unsigned long)d & 15))
5184 *(uint32_t *)d = *(uint32_t *)s;
5193 __m128i xmm0, xmm1, xmm2, xmm3;
5195 xmm0 = load_128_unaligned ((__m128i*)(s));
5196 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5197 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5198 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5200 save_128_aligned ((__m128i*)(d), xmm0);
5201 save_128_aligned ((__m128i*)(d + 16), xmm1);
5202 save_128_aligned ((__m128i*)(d + 32), xmm2);
5203 save_128_aligned ((__m128i*)(d + 48), xmm3);
5212 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5221 *(uint32_t *)d = *(uint32_t *)s;
5230 *(uint16_t *)d = *(uint16_t *)s;
5243 sse2_composite_copy_area (pixman_implementation_t *imp,
5245 pixman_image_t * src_image,
5246 pixman_image_t * mask_image,
5247 pixman_image_t * dst_image,
5257 pixman_blt_sse2 (src_image->bits.bits,
5258 dst_image->bits.bits,
5259 src_image->bits.rowstride,
5260 dst_image->bits.rowstride,
5261 PIXMAN_FORMAT_BPP (src_image->bits.format),
5262 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5263 src_x, src_y, dest_x, dest_y, width, height);
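/* ---------------------------------------------------------------------
 * composite_over_x888_8_8888
 * x888 source treated as opaque (alpha forced to 0xff), a8 mask,
 * 8888 destination.
 */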
5267 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5269 pixman_image_t * src_image,
5270 pixman_image_t * mask_image,
5271 pixman_image_t * dst_image,
5281 uint32_t *src, *src_line, s;
5282 uint32_t *dst, *dst_line, d;
5283 uint8_t *mask, *mask_line;
5285 int src_stride, mask_stride, dst_stride;
5289 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5290 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5291 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5293 PIXMAN_IMAGE_GET_LINE (
5294 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5295 PIXMAN_IMAGE_GET_LINE (
5296 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5297 PIXMAN_IMAGE_GET_LINE (
5298 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5303 src_line += src_stride;
5305 dst_line += dst_stride;
5307 mask_line += mask_stride;
5311 while (w && (unsigned long)dst & 15)
5313 s = 0xff000000 | *src++;
5314 m = (uint32_t) *mask++;
5316 ms = unpack_32_1x64 (s);
5320 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5321 __m64 md = unpack_32_1x64 (d);
5323 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5326 *dst++ = pack_1x64_32 (ms);
5332 m = *(uint32_t*) mask;
5333 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5335 if (m == 0xffffffff)
5337 save_128_aligned ((__m128i*)dst, xmm_src);
5341 xmm_dst = load_128_aligned ((__m128i*)dst);
5343 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5345 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5346 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5347 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5349 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5351 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5353 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5364 m = (uint32_t) *mask++;
5368 s = 0xff000000 | *src;
5380 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5381 md = unpack_32_1x64 (d);
5382 ms = unpack_32_1x64 (s);
5384 *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
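/* ---------------------------------------------------------------------
 * composite_over_8888_8_8888
 * 8888 source, a8 mask, 8888 destination.
 */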
5399 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5401 pixman_image_t * src_image,
5402 pixman_image_t * mask_image,
5403 pixman_image_t * dst_image,
5413 uint32_t *src, *src_line, s;
5414 uint32_t *dst, *dst_line, d;
5415 uint8_t *mask, *mask_line;
5417 int src_stride, mask_stride, dst_stride;
5420 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5421 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5422 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5424 PIXMAN_IMAGE_GET_LINE (
5425 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5426 PIXMAN_IMAGE_GET_LINE (
5427 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5428 PIXMAN_IMAGE_GET_LINE (
5429 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5434 src_line += src_stride;
5436 dst_line += dst_stride;
5438 mask_line += mask_stride;
5442 while (w && (unsigned long)dst & 15)
5447 m = (uint32_t) *mask++;
5454 if (sa == 0xff && m == 0xff)
5460 __m64 ms, md, ma, msa;
5462 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5463 ms = unpack_32_1x64 (s);
5464 md = unpack_32_1x64 (d);
5466 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5468 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5478 m = *(uint32_t *) mask;
5482 xmm_src = load_128_unaligned ((__m128i*)src);
5484 if (m == 0xffffffff && is_opaque (xmm_src))
5486 save_128_aligned ((__m128i *)dst, xmm_src);
5490 xmm_dst = load_128_aligned ((__m128i *)dst);
5492 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5494 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5495 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5496 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5498 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5499 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5501 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5502 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5504 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5519 m = (uint32_t) *mask++;
5526 if (sa == 0xff && m == 0xff)
5532 __m64 ms, md, ma, msa;
5534 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5535 ms = unpack_32_1x64 (s);
5536 md = unpack_32_1x64 (d);
5538 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5540 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
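/* ---------------------------------------------------------------------
 * composite_over_reverse_n_8888
 * OVER_REVERSE with a solid source: the destination is composited
 * over the constant source.
 */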
5553 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5555 pixman_image_t * src_image,
5556 pixman_image_t * mask_image,
5557 pixman_image_t * dst_image,
5568 uint32_t *dst_line, *dst;
5570 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5571 __m128i xmm_dsta_hi, xmm_dsta_lo;
5575 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5580 PIXMAN_IMAGE_GET_LINE (
5581 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5583 xmm_src = expand_pixel_32_1x128 (src);
5589 dst_line += dst_stride;
5592 while (w && (unsigned long)dst & 15)
5596 vd = unpack_32_1x64 (*dst);
5598 *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5599 _mm_movepi64_pi64 (xmm_src)));
5606 __m128i tmp_lo, tmp_hi;
5608 xmm_dst = load_128_aligned ((__m128i*)dst);
5610 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5611 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5616 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5617 &xmm_dsta_lo, &xmm_dsta_hi,
5621 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5631 vd = unpack_32_1x64 (*dst);
5633 *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5634 _mm_movepi64_pi64 (xmm_src)));
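/* ---------------------------------------------------------------------
 * composite_over_8888_8888_8888
 * 8888 source, 8888 mask (only its alpha channel is used),
 * 8888 destination.
 */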
5645 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5647 pixman_image_t * src_image,
5648 pixman_image_t * mask_image,
5649 pixman_image_t * dst_image,
5659 uint32_t *src, *src_line, s;
5660 uint32_t *dst, *dst_line, d;
5661 uint32_t *mask, *mask_line;
5663 int src_stride, mask_stride, dst_stride;
5666 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5667 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5668 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5670 PIXMAN_IMAGE_GET_LINE (
5671 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5672 PIXMAN_IMAGE_GET_LINE (
5673 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5674 PIXMAN_IMAGE_GET_LINE (
5675 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5680 src_line += src_stride;
5682 dst_line += dst_stride;
5684 mask_line += mask_stride;
5688 while (w && (unsigned long)dst & 15)
5693 m = (*mask++) >> 24;
5700 if (sa == 0xff && m == 0xff)
5706 __m64 ms, md, ma, msa;
5708 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5709 ms = unpack_32_1x64 (s);
5710 md = unpack_32_1x64 (d);
5712 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5714 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5724 xmm_mask = load_128_unaligned ((__m128i*)mask);
5726 if (!is_transparent (xmm_mask))
5728 xmm_src = load_128_unaligned ((__m128i*)src);
5730 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5732 save_128_aligned ((__m128i *)dst, xmm_src);
5736 xmm_dst = load_128_aligned ((__m128i *)dst);
5738 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5739 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5740 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5742 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5743 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5745 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5746 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5748 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5763 m = (*mask++) >> 24;
5770 if (sa == 0xff && m == 0xff)
5776 __m64 ms, md, ma, msa;
5778 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5779 ms = unpack_32_1x64 (s);
5780 md = unpack_32_1x64 (d);
5782 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5784 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5796 /* A variant of 'core_combine_over_u_sse2' with minor tweaks */
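/* vx steps through the source scanline in 16.16 fixed point; unit_x is the per-pixel increment */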
5797 static force_inline void
5798 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5802 pixman_fixed_t unit_x,
5803 pixman_fixed_t max_vx)
5806 const uint32_t* pm = NULL;
5808 __m128i xmm_dst_lo, xmm_dst_hi;
5809 __m128i xmm_src_lo, xmm_src_hi;
5810 __m128i xmm_alpha_lo, xmm_alpha_hi;
5812 /* Align dst on a 16-byte boundary */
5813 while (w && ((unsigned long)pd & 15))
5816 s = combine1 (ps + (vx >> 16), pm);
5819 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
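/* Gather the next four nearest-neighbour source pixels into one 128-bit register */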
5828 uint32_t tmp1, tmp2, tmp3, tmp4;
5830 tmp1 = ps[vx >> 16];
5832 tmp2 = ps[vx >> 16];
5834 tmp3 = ps[vx >> 16];
5836 tmp4 = ps[vx >> 16];
5839 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5841 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5843 if (is_opaque (xmm_src_hi))
5845 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5847 else if (!is_zero (xmm_src_hi))
5849 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5851 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5852 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5854 expand_alpha_2x128 (
5855 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5857 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5858 &xmm_alpha_lo, &xmm_alpha_hi,
5859 &xmm_dst_lo, &xmm_dst_hi);
5861 /* rebuild the 4-pixel data and save */
5862 save_128_aligned ((__m128i*)pd,
5863 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5875 s = combine1 (ps + (vx >> 16), pm);
5878 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5887 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5888 scaled_nearest_scanline_sse2_8888_8888_OVER,
5889 uint32_t, uint32_t, COVER)
5890 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5891 scaled_nearest_scanline_sse2_8888_8888_OVER,
5892 uint32_t, uint32_t, NONE)
5893 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5894 scaled_nearest_scanline_sse2_8888_8888_OVER,
5895 uint32_t, uint32_t, PAD)
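/* FAST_NEAREST_MAINLOOP instantiates the full nearest-neighbour scaling
 * loop around the scanline function above, once per repeat mode (COVER,
 * NONE, PAD); the results are registered via SIMPLE_NEAREST_FAST_PATH_*
 * in the table below. */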
5897 static const pixman_fast_path_t sse2_fast_paths[] =
5899 /* PIXMAN_OP_OVER */
5900 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5901 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5902 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5903 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5904 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5905 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5906 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5907 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5908 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5909 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5910 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5911 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5912 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5913 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5914 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5915 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5916 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5917 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5918 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5919 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5920 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5921 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5922 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5923 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5924 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5925 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5926 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5927 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5928 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5929 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5930 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5931 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5932 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5933 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5934 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5935 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5936 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5937 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5938 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5939 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5940 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5941 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5942 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5943 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5944 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5945 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5947 /* PIXMAN_OP_OVER_REVERSE */
5948 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5949 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5952 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5953 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5954 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5955 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5956 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5957 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5960 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5961 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5962 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5963 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5964 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5965 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5966 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5967 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5968 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5969 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5970 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5971 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5972 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5973 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5976 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5977 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5978 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5980 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5981 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5982 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5983 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5984 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5985 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5986 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5987 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5988 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5989 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5990 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5991 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5996 static pixman_bool_t
5997 sse2_blt (pixman_implementation_t *imp,
5998 uint32_t * src_bits,
5999 uint32_t * dst_bits,
6011 if (!pixman_blt_sse2 (
6012 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6013 src_x, src_y, dst_x, dst_y, width, height))
6016 return _pixman_implementation_blt (
6018 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6019 src_x, src_y, dst_x, dst_y, width, height);
6025 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6026 __attribute__((__force_align_arg_pointer__))
6028 static pixman_bool_t
6029 sse2_fill (pixman_implementation_t *imp,
6039 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
6041 return _pixman_implementation_fill (
6042 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
6049 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6051 int w = iter->width;
6052 __m128i ff000000 = mask_ff000000;
6053 uint32_t *dst = iter->buffer;
6054 uint32_t *src = (uint32_t *)iter->bits;
6056 iter->bits += iter->stride;
6058 while (w && ((unsigned long)dst) & 0x0f)
6060 *dst++ = (*src++) | 0xff000000;
6067 (__m128i *)dst, _mm_or_si128 (
6068 load_128_unaligned ((__m128i *)src), ff000000));
6077 *dst++ = (*src++) | 0xff000000;
6081 return iter->buffer;
6085 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6087 int w = iter->width;
6088 uint32_t *dst = iter->buffer;
6089 uint8_t *src = iter->bits;
6090 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6092 iter->bits += iter->stride;
6094 while (w && (((unsigned long)dst) & 15))
6096 *dst++ = *(src++) << 24;
6102 xmm0 = _mm_loadu_si128((__m128i *)src);
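/* Two rounds of interleaving with zero push each a8 byte into the top
 * byte of its 32-bit lane, matching the scalar *dst = *src << 24 */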
6104 xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
6105 xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
6106 xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6107 xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6108 xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6109 xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6111 _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
6112 _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
6113 _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
6114 _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6123 *dst++ = *(src++) << 24;
6127 return iter->buffer;
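/* Table mapping pixel formats to the specialized scanline fetchers above */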
6132 pixman_format_code_t format;
6133 pixman_iter_get_scanline_t get_scanline;
6136 static const fetcher_info_t fetchers[] =
6138 { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
6139 { PIXMAN_a8, sse2_fetch_a8 },
6144 sse2_src_iter_init (pixman_implementation_t *imp,
6145 pixman_iter_t *iter,
6146 pixman_image_t *image,
6147 int x, int y, int width, int height,
6148 uint8_t *buffer, iter_flags_t flags)
6151 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
6153 if ((flags & ITER_NARROW) &&
6154 (image->common.flags & FLAGS) == FLAGS &&
6156 x + width <= image->bits.width &&
6157 y + height <= image->bits.height)
6159 const fetcher_info_t *f;
6161 for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
6163 if (image->common.extended_format_code == f->format)
6165 uint8_t *b = (uint8_t *)image->bits.bits;
6166 int s = image->bits.rowstride * 4;
6168 iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
6170 iter->width = width;
6171 iter->buffer = (uint32_t *)buffer;
6173 iter->get_scanline = f->get_scanline;
6179 _pixman_implementation_src_iter_init (
6180 imp->delegate, iter, image, x, y, width, height, buffer, flags);
6183 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6184 __attribute__((__force_align_arg_pointer__))
6186 pixman_implementation_t *
6187 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6189 pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6191 /* SSE2 constants */
6192 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6193 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6194 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6195 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6196 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6197 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6198 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6199 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6200 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
6201 mask_0080 = create_mask_16_128 (0x0080);
6202 mask_00ff = create_mask_16_128 (0x00ff);
6203 mask_0101 = create_mask_16_128 (0x0101);
6204 mask_ffff = create_mask_16_128 (0xffff);
6205 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6206 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
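/* MMX (__m64) constants */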
6209 mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
6210 mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
6212 mask_x0080 = create_mask_16_64 (0x0080);
6213 mask_x00ff = create_mask_16_64 (0x00ff);
6214 mask_x0101 = create_mask_16_64 (0x0101);
6215 mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
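/* All constants above are initialized once here, before the new
 * implementation is returned, so every fast path can rely on them. */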
6219 /* Set up function pointers */
6221 /* SSE2 versions of the combiners from pixman-combine32.c */
6222 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6223 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6224 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6225 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6226 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6227 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6228 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6229 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6230 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6231 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6233 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6235 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6236 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6237 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6238 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6239 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6240 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6241 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6242 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6243 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6244 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6245 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6247 imp->blt = sse2_blt;
6248 imp->fill = sse2_fill;
6250 imp->src_iter_init = sse2_src_iter_init;
6255 #endif /* USE_SSE2 */