1 /*
2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22 * SOFTWARE.
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
28 */
33 #include <mmintrin.h> /* for the __m64 MMX intrinsics */
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38 #include "pixman-fast-path.h"
40 #if defined(_MSC_VER) && defined(_M_AMD64)
41 /* 64-bit Windows doesn't allow MMX to be used, so
42 * the pixman-x64-mmx-emulation.h file contains
43 * implementations of those MMX intrinsics that
44 * are used in the SSE2 implementation.
45 */
46 # include "pixman-x64-mmx-emulation.h"
51 /* -------------------------------------------------------------------- */
55 static __m64 mask_x0080;
56 static __m64 mask_x00ff;
57 static __m64 mask_x0101;
58 static __m64 mask_x_alpha;
60 static __m64 mask_x565_rgb;
61 static __m64 mask_x565_unpack;
63 static __m128i mask_0080;
64 static __m128i mask_00ff;
65 static __m128i mask_0101;
66 static __m128i mask_ffff;
67 static __m128i mask_ff000000;
68 static __m128i mask_alpha;
70 static __m128i mask_565_r;
71 static __m128i mask_565_g1, mask_565_g2;
72 static __m128i mask_565_b;
73 static __m128i mask_red;
74 static __m128i mask_green;
75 static __m128i mask_blue;
77 static __m128i mask_565_fix_rb;
78 static __m128i mask_565_fix_g;
80 /* ---------------------------------------------------------------------- */
83 static force_inline __m128i
84 unpack_32_1x128 (uint32_t data)
86 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
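/* E.g. unpack_32_1x128 (0x80402010) yields the four 16-bit channels
 * 0x0010, 0x0020, 0x0040, 0x0080 (blue through alpha) in the low half
 * of the register, with the high half zero.
 */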
89 static force_inline void
90 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
92 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
93 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
96 static force_inline __m128i
97 unpack_565_to_8888 (__m128i lo)
99 __m128i r, g, b, rb, t;
101 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
102 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
103 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
105 rb = _mm_or_si128 (r, b);
106 t = _mm_and_si128 (rb, mask_565_fix_rb);
107 t = _mm_srli_epi32 (t, 5);
108 rb = _mm_or_si128 (rb, t);
110 t = _mm_and_si128 (g, mask_565_fix_g);
111 t = _mm_srli_epi32 (t, 6);
112 g = _mm_or_si128 (g, t);
114 return _mm_or_si128 (rb, g);
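/* Hypothetical scalar reference for the expansion above (one pixel at
 * a time; the SSE2 version does four pixels per call and leaves the
 * alpha byte zero). Each channel is widened to 8 bits and its top bits
 * are replicated into the low bits, so 0x1f expands to 0xff and 0x00
 * stays 0x00.
 */
static force_inline uint32_t
scalar_565_to_8888 (uint16_t p) /* illustrative only, not part of the original file */
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2); /* replicate top 3 bits of the 5-bit field */
    g = (g << 2) | (g >> 4); /* replicate top 2 bits of the 6-bit field */
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}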
117 static force_inline void
118 unpack_565_128_4x128 (__m128i data,
119 __m128i* data0,
120 __m128i* data1,
121 __m128i* data2,
122 __m128i* data3)
124 __m128i lo, hi;
126 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
127 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
129 lo = unpack_565_to_8888 (lo);
130 hi = unpack_565_to_8888 (hi);
132 unpack_128_2x128 (lo, data0, data1);
133 unpack_128_2x128 (hi, data2, data3);
136 static force_inline uint16_t
137 pack_565_32_16 (uint32_t pixel)
139 return (uint16_t) (((pixel >> 8) & 0xf800) |
140 ((pixel >> 5) & 0x07e0) |
141 ((pixel >> 3) & 0x001f));
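/* E.g. pack_565_32_16 (0x00ff8040) == 0xfc08: red 0xff -> 0x1f,
 * green 0x80 -> 0x20, blue 0x40 -> 0x08; the low bits of each channel
 * are simply truncated (no rounding or dithering).
 */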
144 static force_inline __m128i
145 pack_2x128_128 (__m128i lo, __m128i hi)
147 return _mm_packus_epi16 (lo, hi);
150 static force_inline __m128i
151 pack_565_2x128_128 (__m128i lo, __m128i hi)
153 __m128i data;
154 __m128i r, g1, g2, b;
156 data = pack_2x128_128 (lo, hi);
158 r = _mm_and_si128 (data, mask_565_r);
159 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
160 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
161 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
163 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
166 static force_inline __m128i
167 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
169 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
170 pack_565_2x128_128 (*xmm2, *xmm3));
173 static force_inline int
174 is_opaque (__m128i x)
176 __m128i ffs = _mm_cmpeq_epi8 (x, x);
178 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
181 static force_inline int
182 is_zero (__m128i x)
184 return _mm_movemask_epi8 (
185 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
188 static force_inline int
189 is_transparent (__m128i x)
191 return (_mm_movemask_epi8 (
192 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
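/* In the three tests above, _mm_movemask_epi8 returns one bit per byte
 * of the comparison result. 0xffff means all sixteen bytes matched
 * (is_zero); masking with 0x8888 keeps bits 3, 7, 11 and 15 only, i.e.
 * the alpha byte of each of the four ARGB pixels, so is_opaque and
 * is_transparent test just the four alpha channels.
 */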
195 static force_inline __m128i
196 expand_pixel_32_1x128 (uint32_t data)
198 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
201 static force_inline __m128i
202 expand_alpha_1x128 (__m128i data)
204 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
205 _MM_SHUFFLE (3, 3, 3, 3)),
206 _MM_SHUFFLE (3, 3, 3, 3));
209 static force_inline void
210 expand_alpha_2x128 (__m128i data_lo,
211 __m128i data_hi,
212 __m128i* alpha_lo,
213 __m128i* alpha_hi)
215 __m128i lo, hi;
217 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
218 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
220 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
221 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
224 static force_inline void
225 expand_alpha_rev_2x128 (__m128i data_lo,
226 __m128i data_hi,
227 __m128i* alpha_lo,
228 __m128i* alpha_hi)
230 __m128i lo, hi;
232 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
233 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
234 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
235 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
238 static force_inline void
239 pix_multiply_2x128 (__m128i* data_lo,
240 __m128i* data_hi,
241 __m128i* alpha_lo,
242 __m128i* alpha_hi,
243 __m128i* ret_lo,
244 __m128i* ret_hi)
246 __m128i lo, hi;
248 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
249 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
250 lo = _mm_adds_epu16 (lo, mask_0080);
251 hi = _mm_adds_epu16 (hi, mask_0080);
252 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
253 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
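/* pix_multiply relies on the exact divide-by-255 identity
 *
 *     ((t + 0x80) * 0x101) >> 16 == round (t / 255.0)
 *
 * for t in [0, 255*255]: the saturating add applies the 0x80 bias and
 * _mm_mulhi_epu16 with 0x0101 performs the "* 0x101 >> 16" step on all
 * eight 16-bit channels at once. A hypothetical one-channel scalar
 * reference (illustrative only):
 */
static force_inline uint8_t
mul_un8_ref (uint8_t a, uint8_t b)
{
    uint32_t t = (uint32_t) a * b + 0x80;

    return (uint8_t) ((t * 0x101) >> 16);
}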
256 static force_inline void
257 pix_add_multiply_2x128 (__m128i* src_lo,
258 __m128i* src_hi,
259 __m128i* alpha_dst_lo,
260 __m128i* alpha_dst_hi,
261 __m128i* dst_lo,
262 __m128i* dst_hi,
263 __m128i* alpha_src_lo,
264 __m128i* alpha_src_hi,
265 __m128i* ret_lo,
266 __m128i* ret_hi)
268 __m128i t1_lo, t1_hi;
269 __m128i t2_lo, t2_hi;
271 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
272 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
274 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
275 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
278 static force_inline void
279 negate_2x128 (__m128i data_lo,
280 __m128i data_hi,
281 __m128i* neg_lo,
282 __m128i* neg_hi)
284 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
285 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
288 static force_inline void
289 invert_colors_2x128 (__m128i data_lo,
290 __m128i data_hi,
291 __m128i* inv_lo,
292 __m128i* inv_hi)
294 __m128i lo, hi;
296 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
297 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
298 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
299 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
302 static force_inline void
303 over_2x128 (__m128i* src_lo,
304 __m128i* src_hi,
305 __m128i* alpha_lo,
306 __m128i* alpha_hi,
307 __m128i* dst_lo,
308 __m128i* dst_hi)
310 __m128i t1, t2;
312 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
314 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
316 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
317 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
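/* over() is the Porter-Duff OVER operator for premultiplied pixels,
 * computed per channel:
 *
 *     dst = src + (1 - src.alpha) * dst
 *
 * the saturating add keeps rounding error from wrapping.
 */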
320 static force_inline void
321 over_rev_non_pre_2x128 (__m128i src_lo,
322 __m128i src_hi,
323 __m128i* dst_lo,
324 __m128i* dst_hi)
326 __m128i lo, hi;
327 __m128i alpha_lo, alpha_hi;
329 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
331 lo = _mm_or_si128 (alpha_lo, mask_alpha);
332 hi = _mm_or_si128 (alpha_hi, mask_alpha);
334 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
336 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
338 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
341 static force_inline void
342 in_over_2x128 (__m128i* src_lo,
343 __m128i* src_hi,
344 __m128i* alpha_lo,
345 __m128i* alpha_hi,
346 __m128i* mask_lo,
347 __m128i* mask_hi,
348 __m128i* dst_lo,
349 __m128i* dst_hi)
351 __m128i s_lo, s_hi;
352 __m128i a_lo, a_hi;
354 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
355 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
357 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
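/* in_over computes (src IN mask) OVER dst, i.e. per channel
 *
 *     dst = src * mask + (1 - src.alpha * mask) * dst
 */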
360 /* load 4 pixels from a 16-byte-aligned address */
361 static force_inline __m128i
362 load_128_aligned (__m128i* src)
364 return _mm_load_si128 (src);
367 /* load 4 pixels from an unaligned address */
368 static force_inline __m128i
369 load_128_unaligned (const __m128i* src)
371 return _mm_loadu_si128 (src);
374 /* save 4 pixels to a 16-byte-aligned address, using a non-temporal
375 * (write-combining) store
376 */
377 static force_inline void
378 save_128_write_combining (__m128i* dst,
379 __m128i data)
381 _mm_stream_si128 (dst, data);
384 /* save 4 pixels to a 16-byte-aligned address */
385 static force_inline void
386 save_128_aligned (__m128i* dst,
387 __m128i data)
389 _mm_store_si128 (dst, data);
392 /* save 4 pixels to an unaligned address */
393 static force_inline void
394 save_128_unaligned (__m128i* dst,
395 __m128i data)
397 _mm_storeu_si128 (dst, data);
400 /* ------------------------------------------------------------------ */
404 static force_inline __m64
405 load_32_1x64 (uint32_t data)
407 return _mm_cvtsi32_si64 (data);
410 static force_inline __m128i
411 load_32_1x128 (uint32_t data)
413 return _mm_cvtsi32_si128 (data);
416 static force_inline __m64
417 unpack_32_1x64 (uint32_t data)
419 return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
422 static force_inline __m64
423 expand_alpha_1x64 (__m64 data)
425 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
428 static force_inline __m64
429 expand_alpha_rev_1x64 (__m64 data)
431 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
434 static force_inline __m128i
435 expand_alpha_rev_1x128 (__m128i data)
437 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
440 static force_inline __m64
441 expand_pixel_8_1x64 (uint8_t data)
443 return _mm_shuffle_pi16 (
444 unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
447 static force_inline __m128i
448 expand_pixel_8_1x128 (uint8_t data)
450 return _mm_shufflelo_epi16 (
451 unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
454 static force_inline __m64
455 pix_multiply_1x64 (__m64 data,
456 __m64 alpha)
458 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
459 mask_x0080),
460 mask_x0101);
463 static force_inline __m128i
464 pix_multiply_1x128 (__m128i data,
465 __m128i alpha)
467 return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
468 mask_0080),
469 mask_0101);
472 static force_inline __m64
473 pix_add_multiply_1x64 (__m64* src,
474 __m64* alpha_dst,
475 __m64* dst,
476 __m64* alpha_src)
478 __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
479 __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
481 return _mm_adds_pu8 (t1, t2);
484 static force_inline __m128i
485 pix_add_multiply_1x128 (__m128i* src,
486 __m128i* alpha_dst,
487 __m128i* dst,
488 __m128i* alpha_src)
490 __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
491 __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
493 return _mm_adds_epu8 (t1, t2);
496 static force_inline __m64
497 negate_1x64 (__m64 data)
499 return _mm_xor_si64 (data, mask_x00ff);
502 static force_inline __m128i
503 negate_1x128 (__m128i data)
505 return _mm_xor_si128 (data, mask_00ff);
508 static force_inline __m64
509 invert_colors_1x64 (__m64 data)
511 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
514 static force_inline __m128i
515 invert_colors_1x128 (__m128i data)
517 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
520 static force_inline __m64
521 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
523 return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
526 static force_inline __m128i
527 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
529 return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
532 static force_inline __m64
533 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
535 return over_1x64 (pix_multiply_1x64 (*src, *mask),
536 pix_multiply_1x64 (*alpha, *mask),
537 *dst);
540 static force_inline __m128i
541 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
543 return over_1x128 (pix_multiply_1x128 (*src, *mask),
544 pix_multiply_1x128 (*alpha, *mask),
545 *dst);
548 static force_inline __m64
549 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
551 __m64 alpha = expand_alpha_1x64 (src);
553 return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
554 _mm_or_si64 (alpha, mask_x_alpha)),
555 alpha,
556 dst);
559 static force_inline __m128i
560 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
562 __m128i alpha = expand_alpha_1x128 (src);
564 return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
565 _mm_or_si128 (alpha, mask_alpha)),
566 alpha,
567 dst);
570 static force_inline uint32_t
571 pack_1x64_32 (__m64 data)
573 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
576 static force_inline uint32_t
577 pack_1x128_32 (__m128i data)
579 return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
582 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into
583 * 00RR00GG00BB
586 * --- Expanding 565 in the low word ---
588 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
589 * m = m & (01f0003f001f);
590 * m = m * (008404100840);
591 * m = m >> 8;
593 * Note the trick here - the top word is shifted by another nibble to
594 * avoid it bumping into the middle word
595 */
596 static force_inline __m64
597 expand565_16_1x64 (uint16_t pixel)
599 __m64 p;
600 __m64 t1, t2;
602 p = _mm_cvtsi32_si64 ((uint32_t) pixel);
604 t1 = _mm_slli_si64 (p, 36 - 11);
605 t2 = _mm_slli_si64 (p, 16 - 5);
607 p = _mm_or_si64 (t1, p);
608 p = _mm_or_si64 (t2, p);
609 p = _mm_and_si64 (p, mask_x565_rgb);
610 p = _mm_mullo_pi16 (p, mask_x565_unpack);
612 return _mm_srli_pi16 (p, 8);
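/* This implements the recipe from the comment above: t1/t2 spread the
 * three 565 fields apart (red shifted a nibble short of its final
 * position so it cannot collide with green), mask_x565_rgb isolates
 * them, and the multiply by mask_x565_unpack plus the final shift
 * replicate each field's top bits into its low bits, yielding 8 bits
 * per channel.
 */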
615 static force_inline __m128i
616 expand565_16_1x128 (uint16_t pixel)
618 __m128i m = _mm_cvtsi32_si128 (pixel);
620 m = unpack_565_to_8888 (m);
622 return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
625 /* ----------------------------------------------------------------------------
626 * Compose Core transformations
627 */
628 static force_inline uint32_t
629 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
642 xmms = unpack_32_1x128 (src);
643 return pack_1x128_32 (
644 over_1x128 (xmms, expand_alpha_1x128 (xmms),
645 unpack_32_1x128 (dst)));
651 static force_inline uint32_t
652 combine1 (const uint32_t *ps, const uint32_t *pm)
660 mm = unpack_32_1x128 (*pm);
661 mm = expand_alpha_1x128 (mm);
663 ms = unpack_32_1x128 (s);
664 ms = pix_multiply_1x128 (ms, mm);
666 s = pack_1x128_32 (ms);
672 static force_inline __m128i
673 combine4 (const __m128i *ps, const __m128i *pm)
675 __m128i xmm_src_lo, xmm_src_hi;
676 __m128i xmm_msk_lo, xmm_msk_hi;
681 xmm_msk_lo = load_128_unaligned (pm);
683 if (is_transparent (xmm_msk_lo))
684 return _mm_setzero_si128 ();
687 s = load_128_unaligned (ps);
691 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
692 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
694 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
696 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
697 &xmm_msk_lo, &xmm_msk_hi,
698 &xmm_src_lo, &xmm_src_hi);
700 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
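/* combine1 and combine4 implement the unified source fetch for the
 * combiners below: with no mask (pm == NULL) they return the source
 * as-is; with a mask they return src IN mask.alpha, one pixel at a
 * time (combine1) or four at a time (combine4), so each combiner
 * handles the masked and unmasked cases uniformly.
 */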
706 static force_inline void
707 core_combine_over_u_sse2_mask (uint32_t * pd,
714 /* Align dst on a 16-byte boundary */
715 while (w && ((unsigned long)pd & 15))
718 s = combine1 (ps, pm);
721 *pd = core_combine_over_u_pixel_sse2 (s, d);
730 __m128i mask = load_128_unaligned ((__m128i *)pm);
735 __m128i src_hi, src_lo;
736 __m128i mask_hi, mask_lo;
737 __m128i alpha_hi, alpha_lo;
739 src = load_128_unaligned ((__m128i *)ps);
741 if (is_opaque (_mm_and_si128 (src, mask)))
743 save_128_aligned ((__m128i *)pd, src);
747 __m128i dst = load_128_aligned ((__m128i *)pd);
748 __m128i dst_hi, dst_lo;
750 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
751 unpack_128_2x128 (src, &src_lo, &src_hi);
753 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
754 pix_multiply_2x128 (&src_lo, &src_hi,
758 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
760 expand_alpha_2x128 (src_lo, src_hi,
761 &alpha_lo, &alpha_hi);
763 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
768 pack_2x128_128 (dst_lo, dst_hi));
780 s = combine1 (ps, pm);
783 *pd = core_combine_over_u_pixel_sse2 (s, d);
792 static force_inline void
793 core_combine_over_u_sse2_no_mask (uint32_t * pd,
799 /* Align dst on a 16-byte boundary */
800 while (w && ((unsigned long)pd & 15))
806 *pd = core_combine_over_u_pixel_sse2 (s, d);
815 __m128i src_hi, src_lo, dst_hi, dst_lo;
816 __m128i alpha_hi, alpha_lo;
818 src = load_128_unaligned ((__m128i *)ps);
824 save_128_aligned ((__m128i *)pd, src);
828 __m128i dst = load_128_aligned ((__m128i *)pd);
830 unpack_128_2x128 (src, &src_lo, &src_hi);
831 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
833 expand_alpha_2x128 (src_lo, src_hi,
834 &alpha_lo, &alpha_hi);
835 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
840 pack_2x128_128 (dst_lo, dst_hi));
854 *pd = core_combine_over_u_pixel_sse2 (s, d);
862 static force_inline void
863 core_combine_over_u_sse2 (uint32_t* pd,
869 core_combine_over_u_sse2_mask (pd, ps, pm, w);
871 core_combine_over_u_sse2_no_mask (pd, ps, w);
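/* Each unified combiner follows the same pattern: process single
 * pixels until pd is 16-byte aligned, then four pixels per iteration
 * with aligned destination loads/stores, then the remaining pixels one
 * at a time. The OVER bodies also short-circuit: a group of four fully
 * opaque source pixels is stored directly, and a fully transparent one
 * is skipped.
 */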
874 static force_inline void
875 core_combine_over_reverse_u_sse2 (uint32_t* pd,
882 __m128i xmm_dst_lo, xmm_dst_hi;
883 __m128i xmm_src_lo, xmm_src_hi;
884 __m128i xmm_alpha_lo, xmm_alpha_hi;
886 /* Align dst on a 16-byte boundary */
888 ((unsigned long)pd & 15))
891 s = combine1 (ps, pm);
893 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
902 /* Load unaligned: the source and mask pointers are not
903 * guaranteed to be 16-byte aligned; only pd was aligned above.
904 */
905 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
906 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
908 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
909 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
911 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
912 &xmm_alpha_lo, &xmm_alpha_hi);
914 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
915 &xmm_alpha_lo, &xmm_alpha_hi,
916 &xmm_src_lo, &xmm_src_hi);
918 /* rebuild the 4 pixel data and save */
919 save_128_aligned ((__m128i*)pd,
920 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
933 s = combine1 (ps, pm);
935 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
943 static force_inline uint32_t
944 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
946 uint32_t maska = src >> 24;
952 else if (maska != 0xff)
954 return pack_1x128_32 (
955 pix_multiply_1x128 (unpack_32_1x128 (dst),
956 expand_alpha_1x128 (unpack_32_1x128 (src))));
962 static force_inline void
963 core_combine_in_u_sse2 (uint32_t* pd,
970 __m128i xmm_src_lo, xmm_src_hi;
971 __m128i xmm_dst_lo, xmm_dst_hi;
973 while (w && ((unsigned long) pd & 15))
975 s = combine1 (ps, pm);
978 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
987 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
988 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
990 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
991 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
993 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
994 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
995 &xmm_dst_lo, &xmm_dst_hi,
996 &xmm_dst_lo, &xmm_dst_hi);
998 save_128_aligned ((__m128i*)pd,
999 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1010 s = combine1 (ps, pm);
1013 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
1021 static force_inline void
1022 core_combine_reverse_in_u_sse2 (uint32_t* pd,
1029 __m128i xmm_src_lo, xmm_src_hi;
1030 __m128i xmm_dst_lo, xmm_dst_hi;
1032 while (w && ((unsigned long) pd & 15))
1034 s = combine1 (ps, pm);
1037 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
1046 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1047 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1049 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1050 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1052 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1053 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1054 &xmm_src_lo, &xmm_src_hi,
1055 &xmm_dst_lo, &xmm_dst_hi);
1058 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1069 s = combine1 (ps, pm);
1072 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
1080 static force_inline void
1081 core_combine_reverse_out_u_sse2 (uint32_t* pd,
1086 while (w && ((unsigned long) pd & 15))
1088 uint32_t s = combine1 (ps, pm);
1091 *pd++ = pack_1x128_32 (
1092 pix_multiply_1x128 (
1093 unpack_32_1x128 (d), negate_1x128 (
1094 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1104 __m128i xmm_src_lo, xmm_src_hi;
1105 __m128i xmm_dst_lo, xmm_dst_hi;
1107 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1108 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1110 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1111 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1113 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1114 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1116 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1117 &xmm_src_lo, &xmm_src_hi,
1118 &xmm_dst_lo, &xmm_dst_hi);
1121 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1133 uint32_t s = combine1 (ps, pm);
1136 *pd++ = pack_1x128_32 (
1137 pix_multiply_1x128 (
1138 unpack_32_1x128 (d), negate_1x128 (
1139 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1147 static force_inline void
1148 core_combine_out_u_sse2 (uint32_t* pd,
1153 while (w && ((unsigned long) pd & 15))
1155 uint32_t s = combine1 (ps, pm);
1158 *pd++ = pack_1x128_32 (
1159 pix_multiply_1x128 (
1160 unpack_32_1x128 (s), negate_1x128 (
1161 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1170 __m128i xmm_src_lo, xmm_src_hi;
1171 __m128i xmm_dst_lo, xmm_dst_hi;
1173 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1174 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1176 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1177 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1179 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1180 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1182 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1183 &xmm_dst_lo, &xmm_dst_hi,
1184 &xmm_dst_lo, &xmm_dst_hi);
1187 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1198 uint32_t s = combine1 (ps, pm);
1201 *pd++ = pack_1x128_32 (
1202 pix_multiply_1x128 (
1203 unpack_32_1x128 (s), negate_1x128 (
1204 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1212 static force_inline uint32_t
1213 core_combine_atop_u_pixel_sse2 (uint32_t src,
1216 __m128i s = unpack_32_1x128 (src);
1217 __m128i d = unpack_32_1x128 (dst);
1219 __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1220 __m128i da = expand_alpha_1x128 (d);
1222 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
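/* ATOP: dst = src * dst.alpha + dst * (1 - src.alpha) */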
1225 static force_inline void
1226 core_combine_atop_u_sse2 (uint32_t* pd,
1233 __m128i xmm_src_lo, xmm_src_hi;
1234 __m128i xmm_dst_lo, xmm_dst_hi;
1235 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1236 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1238 while (w && ((unsigned long) pd & 15))
1240 s = combine1 (ps, pm);
1243 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1252 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1253 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1255 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1256 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1258 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1259 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1260 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1261 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1263 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1264 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1266 pix_add_multiply_2x128 (
1267 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1268 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1269 &xmm_dst_lo, &xmm_dst_hi);
1272 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1283 s = combine1 (ps, pm);
1286 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1294 static force_inline uint32_t
1295 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1298 __m128i s = unpack_32_1x128 (src);
1299 __m128i d = unpack_32_1x128 (dst);
1301 __m128i sa = expand_alpha_1x128 (s);
1302 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1304 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
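/* ATOP_REVERSE: dst = src * (1 - dst.alpha) + dst * src.alpha */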
1307 static force_inline void
1308 core_combine_reverse_atop_u_sse2 (uint32_t* pd,
1315 __m128i xmm_src_lo, xmm_src_hi;
1316 __m128i xmm_dst_lo, xmm_dst_hi;
1317 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1318 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1320 while (w && ((unsigned long) pd & 15))
1322 s = combine1 (ps, pm);
1325 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1334 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1335 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1337 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1338 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1340 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1341 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1342 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1343 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1345 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1346 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1348 pix_add_multiply_2x128 (
1349 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1350 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1351 &xmm_dst_lo, &xmm_dst_hi);
1354 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1365 s = combine1 (ps, pm);
1368 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1376 static force_inline uint32_t
1377 core_combine_xor_u_pixel_sse2 (uint32_t src,
1380 __m128i s = unpack_32_1x128 (src);
1381 __m128i d = unpack_32_1x128 (dst);
1383 __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1384 __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1386 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
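/* XOR: dst = src * (1 - dst.alpha) + dst * (1 - src.alpha) */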
1389 static force_inline void
1390 core_combine_xor_u_sse2 (uint32_t* dst,
1391 const uint32_t* src,
1392 const uint32_t *mask,
1398 const uint32_t* ps = src;
1399 const uint32_t* pm = mask;
1401 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1402 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1403 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1404 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1406 while (w && ((unsigned long) pd & 15))
1408 s = combine1 (ps, pm);
1411 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1420 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1421 xmm_dst = load_128_aligned ((__m128i*) pd);
1423 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1424 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1426 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1427 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1428 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1429 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1431 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1432 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1433 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1434 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1436 pix_add_multiply_2x128 (
1437 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1438 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1439 &xmm_dst_lo, &xmm_dst_hi);
1442 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1453 s = combine1 (ps, pm);
1456 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1464 static force_inline void
1465 core_combine_add_u_sse2 (uint32_t* dst,
1466 const uint32_t* src,
1467 const uint32_t* mask,
1473 const uint32_t* ps = src;
1474 const uint32_t* pm = mask;
1476 while (w && (unsigned long)pd & 15)
1478 s = combine1 (ps, pm);
1484 *pd++ = _mm_cvtsi128_si32 (
1485 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1493 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1496 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1507 s = combine1 (ps, pm);
1511 *pd++ = _mm_cvtsi128_si32 (
1512 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1518 static force_inline uint32_t
1519 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1522 __m128i ms = unpack_32_1x128 (src);
1523 __m128i md = unpack_32_1x128 (dst);
1524 uint32_t sa = src >> 24;
1525 uint32_t da = ~dst >> 24;
1529 ms = pix_multiply_1x128 (
1530 ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1533 return pack_1x128_32 (_mm_adds_epu16 (md, ms));
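/* SATURATE: da above is the alpha headroom left in dst (~dst >> 24).
 * When sa exceeds it, src is first scaled by DIV_UN8 (da, sa) so the
 * add saturates exactly at alpha 0xff; otherwise this degenerates to a
 * plain ADD.
 */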
1536 static force_inline void
1537 core_combine_saturate_u_sse2 (uint32_t * pd,
1545 __m128i xmm_src, xmm_dst;
1547 while (w && (unsigned long)pd & 15)
1549 s = combine1 (ps, pm);
1552 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1561 xmm_dst = load_128_aligned ((__m128i*)pd);
1562 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1564 pack_cmp = _mm_movemask_epi8 (
1565 _mm_cmpgt_epi32 (
1566 _mm_srli_epi32 (xmm_src, 24),
1567 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1569 /* if any source alpha is greater than the corresponding ~dst alpha */
1572 s = combine1 (ps++, pm);
1574 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1578 s = combine1 (ps++, pm);
1580 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1584 s = combine1 (ps++, pm);
1586 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1590 s = combine1 (ps++, pm);
1592 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1598 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1611 s = combine1 (ps, pm);
1614 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1621 static force_inline void
1622 core_combine_src_ca_sse2 (uint32_t* pd,
1629 __m128i xmm_src_lo, xmm_src_hi;
1630 __m128i xmm_mask_lo, xmm_mask_hi;
1631 __m128i xmm_dst_lo, xmm_dst_hi;
1633 while (w && (unsigned long)pd & 15)
1637 *pd++ = pack_1x128_32 (
1638 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1644 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1645 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1647 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1648 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1650 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1651 &xmm_mask_lo, &xmm_mask_hi,
1652 &xmm_dst_lo, &xmm_dst_hi);
1655 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1667 *pd++ = pack_1x128_32 (
1668 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1673 static force_inline uint32_t
1674 core_combine_over_ca_pixel_sse2 (uint32_t src,
1678 __m128i s = unpack_32_1x128 (src);
1679 __m128i expAlpha = expand_alpha_1x128 (s);
1680 __m128i unpk_mask = unpack_32_1x128 (mask);
1681 __m128i unpk_dst = unpack_32_1x128 (dst);
1683 return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1686 static force_inline void
1687 core_combine_over_ca_sse2 (uint32_t* pd,
1694 __m128i xmm_alpha_lo, xmm_alpha_hi;
1695 __m128i xmm_src_lo, xmm_src_hi;
1696 __m128i xmm_dst_lo, xmm_dst_hi;
1697 __m128i xmm_mask_lo, xmm_mask_hi;
1699 while (w && (unsigned long)pd & 15)
1705 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1711 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1712 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1713 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1715 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1716 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1717 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1719 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1720 &xmm_alpha_lo, &xmm_alpha_hi);
1722 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1723 &xmm_alpha_lo, &xmm_alpha_hi,
1724 &xmm_mask_lo, &xmm_mask_hi,
1725 &xmm_dst_lo, &xmm_dst_hi);
1728 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1742 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1747 static force_inline uint32_t
1748 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1752 __m128i d = unpack_32_1x128 (dst);
1754 return pack_1x128_32 (
1755 over_1x128 (d, expand_alpha_1x128 (d),
1756 pix_multiply_1x128 (unpack_32_1x128 (src),
1757 unpack_32_1x128 (mask))));
1760 static force_inline void
1761 core_combine_over_reverse_ca_sse2 (uint32_t* pd,
1768 __m128i xmm_alpha_lo, xmm_alpha_hi;
1769 __m128i xmm_src_lo, xmm_src_hi;
1770 __m128i xmm_dst_lo, xmm_dst_hi;
1771 __m128i xmm_mask_lo, xmm_mask_hi;
1773 while (w && (unsigned long)pd & 15)
1779 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1785 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1786 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1787 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1789 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1790 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1791 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1793 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1794 &xmm_alpha_lo, &xmm_alpha_hi);
1795 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1796 &xmm_mask_lo, &xmm_mask_hi,
1797 &xmm_mask_lo, &xmm_mask_hi);
1799 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1800 &xmm_alpha_lo, &xmm_alpha_hi,
1801 &xmm_mask_lo, &xmm_mask_hi);
1804 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1818 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1823 static force_inline void
1824 core_combine_in_ca_sse2 (uint32_t * pd,
1831 __m128i xmm_alpha_lo, xmm_alpha_hi;
1832 __m128i xmm_src_lo, xmm_src_hi;
1833 __m128i xmm_dst_lo, xmm_dst_hi;
1834 __m128i xmm_mask_lo, xmm_mask_hi;
1836 while (w && (unsigned long)pd & 15)
1842 *pd++ = pack_1x128_32 (
1843 pix_multiply_1x128 (
1844 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1845 expand_alpha_1x128 (unpack_32_1x128 (d))));
1852 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1853 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1854 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1856 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1857 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1858 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1860 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1861 &xmm_alpha_lo, &xmm_alpha_hi);
1863 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1864 &xmm_mask_lo, &xmm_mask_hi,
1865 &xmm_dst_lo, &xmm_dst_hi);
1867 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1868 &xmm_alpha_lo, &xmm_alpha_hi,
1869 &xmm_dst_lo, &xmm_dst_hi);
1872 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1886 *pd++ = pack_1x128_32 (
1887 pix_multiply_1x128 (
1888 pix_multiply_1x128 (
1889 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1890 expand_alpha_1x128 (unpack_32_1x128 (d))));
1896 static force_inline void
1897 core_combine_in_reverse_ca_sse2 (uint32_t * pd,
1904 __m128i xmm_alpha_lo, xmm_alpha_hi;
1905 __m128i xmm_src_lo, xmm_src_hi;
1906 __m128i xmm_dst_lo, xmm_dst_hi;
1907 __m128i xmm_mask_lo, xmm_mask_hi;
1909 while (w && (unsigned long)pd & 15)
1915 *pd++ = pack_1x128_32 (
1916 pix_multiply_1x128 (
1917 unpack_32_1x128 (d),
1918 pix_multiply_1x128 (unpack_32_1x128 (m),
1919 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1925 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1926 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1927 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1929 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1930 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1931 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1933 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1934 &xmm_alpha_lo, &xmm_alpha_hi);
1935 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1936 &xmm_alpha_lo, &xmm_alpha_hi,
1937 &xmm_alpha_lo, &xmm_alpha_hi);
1939 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1940 &xmm_alpha_lo, &xmm_alpha_hi,
1941 &xmm_dst_lo, &xmm_dst_hi);
1944 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1958 *pd++ = pack_1x128_32 (
1959 pix_multiply_1x128 (
1960 unpack_32_1x128 (d),
1961 pix_multiply_1x128 (unpack_32_1x128 (m),
1962 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1967 static force_inline void
1968 core_combine_out_ca_sse2 (uint32_t * pd,
1975 __m128i xmm_alpha_lo, xmm_alpha_hi;
1976 __m128i xmm_src_lo, xmm_src_hi;
1977 __m128i xmm_dst_lo, xmm_dst_hi;
1978 __m128i xmm_mask_lo, xmm_mask_hi;
1980 while (w && (unsigned long)pd & 15)
1986 *pd++ = pack_1x128_32 (
1987 pix_multiply_1x128 (
1988 pix_multiply_1x128 (
1989 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1990 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1996 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1997 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1998 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2000 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2001 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2002 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2004 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2005 &xmm_alpha_lo, &xmm_alpha_hi);
2006 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2007 &xmm_alpha_lo, &xmm_alpha_hi);
2009 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2010 &xmm_mask_lo, &xmm_mask_hi,
2011 &xmm_dst_lo, &xmm_dst_hi);
2012 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2013 &xmm_alpha_lo, &xmm_alpha_hi,
2014 &xmm_dst_lo, &xmm_dst_hi);
2017 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2031 *pd++ = pack_1x128_32 (
2032 pix_multiply_1x128 (
2033 pix_multiply_1x128 (
2034 unpack_32_1x128 (s), unpack_32_1x128 (m)),
2035 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
2041 static force_inline void
2042 core_combine_out_reverse_ca_sse2 (uint32_t * pd,
2049 __m128i xmm_alpha_lo, xmm_alpha_hi;
2050 __m128i xmm_src_lo, xmm_src_hi;
2051 __m128i xmm_dst_lo, xmm_dst_hi;
2052 __m128i xmm_mask_lo, xmm_mask_hi;
2054 while (w && (unsigned long)pd & 15)
2060 *pd++ = pack_1x128_32 (
2061 pix_multiply_1x128 (
2062 unpack_32_1x128 (d),
2063 negate_1x128 (pix_multiply_1x128 (
2064 unpack_32_1x128 (m),
2065 expand_alpha_1x128 (unpack_32_1x128 (s))))));
2071 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2072 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2073 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2075 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2076 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2077 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2079 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2080 &xmm_alpha_lo, &xmm_alpha_hi);
2082 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2083 &xmm_alpha_lo, &xmm_alpha_hi,
2084 &xmm_mask_lo, &xmm_mask_hi);
2086 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2087 &xmm_mask_lo, &xmm_mask_hi);
2089 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2090 &xmm_mask_lo, &xmm_mask_hi,
2091 &xmm_dst_lo, &xmm_dst_hi);
2094 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2108 *pd++ = pack_1x128_32 (
2109 pix_multiply_1x128 (
2110 unpack_32_1x128 (d),
2111 negate_1x128 (pix_multiply_1x128 (
2112 unpack_32_1x128 (m),
2113 expand_alpha_1x128 (unpack_32_1x128 (s))))));
2118 static force_inline uint32_t
2119 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2123 __m128i m = unpack_32_1x128 (mask);
2124 __m128i s = unpack_32_1x128 (src);
2125 __m128i d = unpack_32_1x128 (dst);
2126 __m128i sa = expand_alpha_1x128 (s);
2127 __m128i da = expand_alpha_1x128 (d);
2129 s = pix_multiply_1x128 (s, m);
2130 m = negate_1x128 (pix_multiply_1x128 (m, sa));
2132 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
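/* Component-alpha ATOP:
 * dst = src * mask * dst.alpha + dst * (1 - mask * src.alpha) */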
2135 static force_inline void
2136 core_combine_atop_ca_sse2 (uint32_t * pd,
2143 __m128i xmm_src_lo, xmm_src_hi;
2144 __m128i xmm_dst_lo, xmm_dst_hi;
2145 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2146 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2147 __m128i xmm_mask_lo, xmm_mask_hi;
2149 while (w && (unsigned long)pd & 15)
2155 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2161 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2162 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2163 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2165 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2166 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2167 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2169 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2170 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2171 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2172 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2174 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2175 &xmm_mask_lo, &xmm_mask_hi,
2176 &xmm_src_lo, &xmm_src_hi);
2177 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2178 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2179 &xmm_mask_lo, &xmm_mask_hi);
2181 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2183 pix_add_multiply_2x128 (
2184 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2185 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2186 &xmm_dst_lo, &xmm_dst_hi);
2189 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2203 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2208 static force_inline uint32_t
2209 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2213 __m128i m = unpack_32_1x128 (mask);
2214 __m128i s = unpack_32_1x128 (src);
2215 __m128i d = unpack_32_1x128 (dst);
2217 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2218 __m128i sa = expand_alpha_1x128 (s);
2220 s = pix_multiply_1x128 (s, m);
2221 m = pix_multiply_1x128 (m, sa);
2223 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
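/* Component-alpha ATOP_REVERSE:
 * dst = src * mask * (1 - dst.alpha) + dst * mask * src.alpha */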
2226 static force_inline void
2227 core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
2234 __m128i xmm_src_lo, xmm_src_hi;
2235 __m128i xmm_dst_lo, xmm_dst_hi;
2236 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2237 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2238 __m128i xmm_mask_lo, xmm_mask_hi;
2240 while (w && (unsigned long)pd & 15)
2246 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2252 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2253 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2254 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2256 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2257 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2258 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2260 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2261 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2262 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2263 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2265 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2266 &xmm_mask_lo, &xmm_mask_hi,
2267 &xmm_src_lo, &xmm_src_hi);
2268 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2269 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2270 &xmm_mask_lo, &xmm_mask_hi);
2272 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2273 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2275 pix_add_multiply_2x128 (
2276 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2277 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2278 &xmm_dst_lo, &xmm_dst_hi);
2281 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2295 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2300 static force_inline uint32_t
2301 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2305 __m128i a = unpack_32_1x128 (mask);
2306 __m128i s = unpack_32_1x128 (src);
2307 __m128i d = unpack_32_1x128 (dst);
2309 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2310 a, expand_alpha_1x128 (s)));
2311 __m128i dest = pix_multiply_1x128 (s, a);
2312 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2314 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2315 &alpha_dst,
2316 &dest,
2317 &alpha_src));
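/* Component-alpha XOR:
 * dst = src * mask * (1 - dst.alpha) + dst * (1 - mask * src.alpha) */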
2320 static force_inline void
2321 core_combine_xor_ca_sse2 (uint32_t * pd,
2328 __m128i xmm_src_lo, xmm_src_hi;
2329 __m128i xmm_dst_lo, xmm_dst_hi;
2330 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2331 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2332 __m128i xmm_mask_lo, xmm_mask_hi;
2334 while (w && (unsigned long)pd & 15)
2340 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2346 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2347 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2348 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2350 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2351 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2352 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2354 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2355 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2356 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2357 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2359 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2360 &xmm_mask_lo, &xmm_mask_hi,
2361 &xmm_src_lo, &xmm_src_hi);
2362 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2363 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2364 &xmm_mask_lo, &xmm_mask_hi);
2366 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2367 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2368 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2369 &xmm_mask_lo, &xmm_mask_hi);
2371 pix_add_multiply_2x128 (
2372 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2373 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2374 &xmm_dst_lo, &xmm_dst_hi);
2377 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2391 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2396 static force_inline void
2397 core_combine_add_ca_sse2 (uint32_t * pd,
2404 __m128i xmm_src_lo, xmm_src_hi;
2405 __m128i xmm_dst_lo, xmm_dst_hi;
2406 __m128i xmm_mask_lo, xmm_mask_hi;
2408 while (w && (unsigned long)pd & 15)
2414 *pd++ = pack_1x128_32 (
2415 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2416 unpack_32_1x128 (m)),
2417 unpack_32_1x128 (d)));
2423 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2424 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2425 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2427 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2428 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2429 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2431 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2432 &xmm_mask_lo, &xmm_mask_hi,
2433 &xmm_src_lo, &xmm_src_hi);
2436 (__m128i*)pd, pack_2x128_128 (
2437 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2438 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2452 *pd++ = pack_1x128_32 (
2453 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2454 unpack_32_1x128 (m)),
2455 unpack_32_1x128 (d)));
2460 /* ---------------------------------------------------
2461 * fb_compose_setup_SSE2
2462 */
2463 static force_inline __m64
2464 create_mask_16_64 (uint16_t mask)
2466 return _mm_set1_pi16 (mask);
2469 static force_inline __m128i
2470 create_mask_16_128 (uint16_t mask)
2472 return _mm_set1_epi16 (mask);
2475 static force_inline __m64
2476 create_mask_2x32_64 (uint32_t mask0,
2479 return _mm_set_pi32 (mask0, mask1);
2482 /* Work around a code generation bug in Sun Studio 12. */
2483 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2484 # define create_mask_2x32_128(mask0, mask1) \
2485 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2487 static force_inline __m128i
2488 create_mask_2x32_128 (uint32_t mask0,
2491 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
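/* These helpers build the broadcast constants declared at the top of
 * the file; e.g. mask_ff000000 would be initialized as
 * create_mask_2x32_128 (0xff000000, 0xff000000) in the implementation
 * setup code (not shown in this section).
 */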
2495 /* SSE2 code patch for fbcompose.c */
2498 sse2_combine_over_u (pixman_implementation_t *imp,
2501 const uint32_t * src,
2502 const uint32_t * mask,
2505 core_combine_over_u_sse2 (dst, src, mask, width);
2510 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2513 const uint32_t * src,
2514 const uint32_t * mask,
2517 core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2522 sse2_combine_in_u (pixman_implementation_t *imp,
2525 const uint32_t * src,
2526 const uint32_t * mask,
2529 core_combine_in_u_sse2 (dst, src, mask, width);
2534 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2537 const uint32_t * src,
2538 const uint32_t * mask,
2541 core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2546 sse2_combine_out_u (pixman_implementation_t *imp,
2549 const uint32_t * src,
2550 const uint32_t * mask,
2553 core_combine_out_u_sse2 (dst, src, mask, width);
2558 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2561 const uint32_t * src,
2562 const uint32_t * mask,
2565 core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2570 sse2_combine_atop_u (pixman_implementation_t *imp,
2573 const uint32_t * src,
2574 const uint32_t * mask,
2577 core_combine_atop_u_sse2 (dst, src, mask, width);
2582 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2585 const uint32_t * src,
2586 const uint32_t * mask,
2589 core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2594 sse2_combine_xor_u (pixman_implementation_t *imp,
2597 const uint32_t * src,
2598 const uint32_t * mask,
2601 core_combine_xor_u_sse2 (dst, src, mask, width);
2606 sse2_combine_add_u (pixman_implementation_t *imp,
2609 const uint32_t * src,
2610 const uint32_t * mask,
2613 core_combine_add_u_sse2 (dst, src, mask, width);
2618 sse2_combine_saturate_u (pixman_implementation_t *imp,
2621 const uint32_t * src,
2622 const uint32_t * mask,
2625 core_combine_saturate_u_sse2 (dst, src, mask, width);
2630 sse2_combine_src_ca (pixman_implementation_t *imp,
2633 const uint32_t * src,
2634 const uint32_t * mask,
2637 core_combine_src_ca_sse2 (dst, src, mask, width);
2642 sse2_combine_over_ca (pixman_implementation_t *imp,
2645 const uint32_t * src,
2646 const uint32_t * mask,
2649 core_combine_over_ca_sse2 (dst, src, mask, width);
2654 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2657 const uint32_t * src,
2658 const uint32_t * mask,
2661 core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2666 sse2_combine_in_ca (pixman_implementation_t *imp,
2669 const uint32_t * src,
2670 const uint32_t * mask,
2673 core_combine_in_ca_sse2 (dst, src, mask, width);
2678 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2681 const uint32_t * src,
2682 const uint32_t * mask,
2685 core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2690 sse2_combine_out_ca (pixman_implementation_t *imp,
2693 const uint32_t * src,
2694 const uint32_t * mask,
2697 core_combine_out_ca_sse2 (dst, src, mask, width);
2702 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2705 const uint32_t * src,
2706 const uint32_t * mask,
2709 core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2714 sse2_combine_atop_ca (pixman_implementation_t *imp,
2717 const uint32_t * src,
2718 const uint32_t * mask,
2721 core_combine_atop_ca_sse2 (dst, src, mask, width);
2726 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2729 const uint32_t * src,
2730 const uint32_t * mask,
2733 core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2738 sse2_combine_xor_ca (pixman_implementation_t *imp,
2741 const uint32_t * src,
2742 const uint32_t * mask,
2745 core_combine_xor_ca_sse2 (dst, src, mask, width);
2750 sse2_combine_add_ca (pixman_implementation_t *imp,
2753 const uint32_t * src,
2754 const uint32_t * mask,
2757 core_combine_add_ca_sse2 (dst, src, mask, width);
2761 /* -------------------------------------------------------------------
2762 * composite_over_n_8888
2763 */
2766 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2768 pixman_image_t * src_image,
2769 pixman_image_t * mask_image,
2770 pixman_image_t * dst_image,
2781 uint32_t *dst_line, *dst, d;
2784 __m128i xmm_src, xmm_alpha;
2785 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2787 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2792 PIXMAN_IMAGE_GET_LINE (
2793 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2795 xmm_src = expand_pixel_32_1x128 (src);
2796 xmm_alpha = expand_alpha_1x128 (xmm_src);
2802 dst_line += dst_stride;
2805 while (w && (unsigned long)dst & 15)
2808 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2809 xmm_alpha,
2810 unpack_32_1x128 (d)));
2816 xmm_dst = load_128_aligned ((__m128i*)dst);
2818 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2820 over_2x128 (&xmm_src, &xmm_src,
2821 &xmm_alpha, &xmm_alpha,
2822 &xmm_dst_lo, &xmm_dst_hi);
2824 /* rebuild the 4 pixel data and save */
2826 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2835 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2836 xmm_alpha,
2837 unpack_32_1x128 (d)));
2845 /* ---------------------------------------------------------------------
2846 * composite_over_n_0565
2847 */
2849 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2851 pixman_image_t * src_image,
2852 pixman_image_t * mask_image,
2853 pixman_image_t * dst_image,
2864 uint16_t *dst_line, *dst, d;
2867 __m128i xmm_src, xmm_alpha;
2868 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2870 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2875 PIXMAN_IMAGE_GET_LINE (
2876 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2878 xmm_src = expand_pixel_32_1x128 (src);
2879 xmm_alpha = expand_alpha_1x128 (xmm_src);
2885 dst_line += dst_stride;
2888 while (w && (unsigned long)dst & 15)
2892 *dst++ = pack_565_32_16 (
2893 pack_1x128_32 (over_1x128 (xmm_src,
2894 xmm_alpha,
2895 expand565_16_1x128 (d))));
2901 xmm_dst = load_128_aligned ((__m128i*)dst);
2903 unpack_565_128_4x128 (xmm_dst,
2904 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2906 over_2x128 (&xmm_src, &xmm_src,
2907 &xmm_alpha, &xmm_alpha,
2908 &xmm_dst0, &xmm_dst1);
2909 over_2x128 (&xmm_src, &xmm_src,
2910 &xmm_alpha, &xmm_alpha,
2911 &xmm_dst2, &xmm_dst3);
2913 xmm_dst = pack_565_4x128_128 (
2914 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2916 save_128_aligned ((__m128i*)dst, xmm_dst);
2925 *dst++ = pack_565_32_16 (
2926 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2927 expand565_16_1x128 (d))));
2934 /* ------------------------------
2935 * composite_add_n_8888_8888_ca
2936 */
2938 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2940 pixman_image_t * src_image,
2941 pixman_image_t * mask_image,
2942 pixman_image_t * dst_image,
2953 uint32_t *dst_line, d;
2954 uint32_t *mask_line, m;
2956 int dst_stride, mask_stride;
2958 __m128i xmm_src, xmm_alpha;
2960 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2962 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2964 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2970 PIXMAN_IMAGE_GET_LINE (
2971 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2972 PIXMAN_IMAGE_GET_LINE (
2973 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2975 xmm_src = _mm_unpacklo_epi8 (
2976 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2977 xmm_alpha = expand_alpha_1x128 (xmm_src);
2978 mmx_src = xmm_src;
2979 mmx_alpha = xmm_alpha;
2984 const uint32_t *pm = (uint32_t *)mask_line;
2985 uint32_t *pd = (uint32_t *)dst_line;
2987 dst_line += dst_stride;
2988 mask_line += mask_stride;
2990 while (w && (unsigned long)pd & 15)
2998 mmx_mask = unpack_32_1x128 (m);
2999 mmx_dest = unpack_32_1x128 (d);
3001 *pd = pack_1x128_32 (
3002 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
3011 xmm_mask = load_128_unaligned ((__m128i*)pm);
3014 pack_cmp = _mm_movemask_epi8 (
3015 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3017 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3018 if (pack_cmp != 0xffff)
3020 xmm_dst = load_128_aligned ((__m128i*)pd);
3022 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3024 pix_multiply_2x128 (&xmm_src, &xmm_src,
3025 &xmm_mask_lo, &xmm_mask_hi,
3026 &xmm_mask_lo, &xmm_mask_hi);
3027 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3030 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3046 mmx_mask = unpack_32_1x128 (m);
3047 mmx_dest = unpack_32_1x128 (d);
3049 *pd = pack_1x128_32 (
3050 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
3061 /* ---------------------------------------------------------------------------
3062 * composite_over_n_8888_8888_ca
3063 */
3066 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3068 pixman_image_t * src_image,
3069 pixman_image_t * mask_image,
3070 pixman_image_t * dst_image,
3081 uint32_t *dst_line, d;
3082 uint32_t *mask_line, m;
3084 int dst_stride, mask_stride;
3086 __m128i xmm_src, xmm_alpha;
3087 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3088 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3090 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3092 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3097 PIXMAN_IMAGE_GET_LINE (
3098 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3099 PIXMAN_IMAGE_GET_LINE (
3100 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3102 xmm_src = _mm_unpacklo_epi8 (
3103 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3104 xmm_alpha = expand_alpha_1x128 (xmm_src);
3105 mmx_src = xmm_src;
3106 mmx_alpha = xmm_alpha;
3111 const uint32_t *pm = (uint32_t *)mask_line;
3112 uint32_t *pd = (uint32_t *)dst_line;
3114 dst_line += dst_stride;
3115 mask_line += mask_stride;
3117 while (w && (unsigned long)pd & 15)
3124 mmx_mask = unpack_32_1x128 (m);
3125 mmx_dest = unpack_32_1x128 (d);
3127 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
3128 &mmx_alpha,
3129 &mmx_mask,
3130 &mmx_dest));
3139 xmm_mask = load_128_unaligned ((__m128i*)pm);
3142 pack_cmp = _mm_movemask_epi8 (
3143 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3145 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3146 if (pack_cmp != 0xffff)
3148 xmm_dst = load_128_aligned ((__m128i*)pd);
3150 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3151 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3153 in_over_2x128 (&xmm_src, &xmm_src,
3154 &xmm_alpha, &xmm_alpha,
3155 &xmm_mask_lo, &xmm_mask_hi,
3156 &xmm_dst_lo, &xmm_dst_hi);
3159 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3174 mmx_mask = unpack_32_1x128 (m);
3175 mmx_dest = unpack_32_1x128 (d);
3177 *pd = pack_1x128_32 (
3178 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3189 /*---------------------------------------------------------------------
3190 * composite_over_8888_n_8888
3194 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3196 pixman_image_t * src_image,
3197 pixman_image_t * mask_image,
3198 pixman_image_t * dst_image,
3208 uint32_t *dst_line, *dst;
3209 uint32_t *src_line, *src;
3212 int dst_stride, src_stride;
3215 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3216 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3217 __m128i xmm_alpha_lo, xmm_alpha_hi;
3219 PIXMAN_IMAGE_GET_LINE (
3220 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3221 PIXMAN_IMAGE_GET_LINE (
3222 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3224 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3226 xmm_mask = create_mask_16_128 (mask >> 24);
3231 dst_line += dst_stride;
3233 src_line += src_stride;
3236 while (w && (unsigned long)dst & 15)
3238 uint32_t s = *src++;
3244 __m128i ms = unpack_32_1x128 (s);
3245 __m128i alpha = expand_alpha_1x128 (ms);
3246 __m128i dest = xmm_mask;
3247 __m128i alpha_dst = unpack_32_1x128 (d);
3249 *dst = pack_1x128_32 (
3250 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
3258 xmm_src = load_128_unaligned ((__m128i*)src);
3260 if (!is_zero (xmm_src))
3262 xmm_dst = load_128_aligned ((__m128i*)dst);
3264 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3265 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3266 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3267 &xmm_alpha_lo, &xmm_alpha_hi);
3269 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3270 &xmm_alpha_lo, &xmm_alpha_hi,
3271 &xmm_mask, &xmm_mask,
3272 &xmm_dst_lo, &xmm_dst_hi);
3275 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3285 uint32_t s = *src++;
3291 __m128i ms = unpack_32_1x128 (s);
3292 __m128i alpha = expand_alpha_1x128 (ms);
3293 __m128i mask = xmm_mask;
3294 __m128i dest = unpack_32_1x128 (d);
3296 *dst = pack_1x128_32 (
3297 in_over_1x128 (&ms, &alpha, &mask, &dest));
3308 /*---------------------------------------------------------------------
3309 * composite_src_x888_8888
3313 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
3315 pixman_image_t * src_image,
3316 pixman_image_t * mask_image,
3317 pixman_image_t * dst_image,
3327 uint32_t *dst_line, *dst;
3328 uint32_t *src_line, *src;
3330 int dst_stride, src_stride;
3333 PIXMAN_IMAGE_GET_LINE (
3334 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3335 PIXMAN_IMAGE_GET_LINE (
3336 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3341 dst_line += dst_stride;
3343 src_line += src_stride;
3346 while (w && (unsigned long)dst & 15)
3348 *dst++ = *src++ | 0xff000000;
3354 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
3356 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
3357 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
3358 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
3359 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
3361 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
3362 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
3363 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
3364 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
3373 *dst++ = *src++ | 0xff000000;
3381 /* ---------------------------------------------------------------------
3382 * composite_over_x888_n_8888
3385 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3387 pixman_image_t * src_image,
3388 pixman_image_t * mask_image,
3389 pixman_image_t * dst_image,
3399 uint32_t *dst_line, *dst;
3400 uint32_t *src_line, *src;
3402 int dst_stride, src_stride;
3405 __m128i xmm_mask, xmm_alpha;
3406 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3407 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3409 PIXMAN_IMAGE_GET_LINE (
3410 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3411 PIXMAN_IMAGE_GET_LINE (
3412 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3414 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3416 xmm_mask = create_mask_16_128 (mask >> 24);
3417 xmm_alpha = mask_00ff;
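3418 /* An x888 source has no alpha channel; treat it as fully opaque by
3419 * using a constant 0xff alpha (mask_00ff) in the in_over below. */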
3422 dst_line += dst_stride;
3424 src_line += src_stride;
3427 while (w && (unsigned long)dst & 15)
3429 uint32_t s = (*src++) | 0xff000000;
3432 __m128i src = unpack_32_1x128 (s);
3433 __m128i alpha = xmm_alpha;
3434 __m128i mask = xmm_mask;
3435 __m128i dest = unpack_32_1x128 (d);
3437 *dst++ = pack_1x128_32 (
3438 in_over_1x128 (&src, &alpha, &mask, &dest));
3445 xmm_src = _mm_or_si128 (
3446 load_128_unaligned ((__m128i*)src), mask_ff000000);
3447 xmm_dst = load_128_aligned ((__m128i*)dst);
3449 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3450 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3452 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3453 &xmm_alpha, &xmm_alpha,
3454 &xmm_mask, &xmm_mask,
3455 &xmm_dst_lo, &xmm_dst_hi);
3458 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3468 uint32_t s = (*src++) | 0xff000000;
3471 __m128i src = unpack_32_1x128 (s);
3472 __m128i alpha = xmm_alpha;
3473 __m128i mask = xmm_mask;
3474 __m128i dest = unpack_32_1x128 (d);
3476 *dst++ = pack_1x128_32 (
3477 in_over_1x128 (&src, &alpha, &mask, &dest));
3486 /* --------------------------------------------------------------------
3487 * composite_over_8888_8888
3490 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3492 pixman_image_t * src_image,
3493 pixman_image_t * mask_image,
3494 pixman_image_t * dst_image,
3504 int dst_stride, src_stride;
3505 uint32_t *dst_line, *dst;
3506 uint32_t *src_line, *src;
3508 PIXMAN_IMAGE_GET_LINE (
3509 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3510 PIXMAN_IMAGE_GET_LINE (
3511 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3518 core_combine_over_u_sse2 (dst, src, NULL, width);
3526 /* ------------------------------------------------------------------
3527 * composite_over_8888_0565
3529 static force_inline uint16_t
3530 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3534 ms = unpack_32_1x128 (src);
3535 return pack_565_32_16 (
3538 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3542 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3544 pixman_image_t * src_image,
3545 pixman_image_t * mask_image,
3546 pixman_image_t * dst_image,
3556 uint16_t *dst_line, *dst, d;
3557 uint32_t *src_line, *src, s;
3558 int dst_stride, src_stride;
3561 __m128i xmm_alpha_lo, xmm_alpha_hi;
3562 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3563 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3565 PIXMAN_IMAGE_GET_LINE (
3566 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3567 PIXMAN_IMAGE_GET_LINE (
3568 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3573 * This code is copied from the MMX implementation, FIXME included.
3574 * If it's a problem there, it's probably a problem here too.
3576 assert (src_image->drawable == mask_image->drawable);
3584 dst_line += dst_stride;
3585 src_line += src_stride;
3588 /* Align dst on a 16-byte boundary */
3589 while (w && ((unsigned long)dst & 15))
3595 *dst++ = composite_over_8888_0565pixel (s, d);
3599 /* This is an 8-pixel loop */
3602 /* Load the source unaligned; its address
3603 * is not guaranteed to be on a 16-byte boundary.
3605 xmm_src = load_128_unaligned ((__m128i*) src);
3606 xmm_dst = load_128_aligned ((__m128i*) dst);
3609 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3610 unpack_565_128_4x128 (xmm_dst,
3611 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3612 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3613 &xmm_alpha_lo, &xmm_alpha_hi);
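3614 /* The 8 r5g6b5 destination pixels are now expanded into four registers of two 8888 pixels each; composite them in two over_2x128 batches. */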
3615 /* Load the next 4 pixels from memory early
3616 * to overlap the read with the computation below.
3618 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3620 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3621 &xmm_alpha_lo, &xmm_alpha_hi,
3622 &xmm_dst0, &xmm_dst1);
3625 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3626 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3627 &xmm_alpha_lo, &xmm_alpha_hi);
3629 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3630 &xmm_alpha_lo, &xmm_alpha_hi,
3631 &xmm_dst2, &xmm_dst3);
3634 (__m128i*)dst, pack_565_4x128_128 (
3635 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3647 *dst++ = composite_over_8888_0565pixel (s, d);
3654 /* -----------------------------------------------------------------
3655 * composite_over_n_8_8888
3659 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3661 pixman_image_t * src_image,
3662 pixman_image_t * mask_image,
3663 pixman_image_t * dst_image,
3674 uint32_t *dst_line, *dst;
3675 uint8_t *mask_line, *mask;
3676 int dst_stride, mask_stride;
3680 __m128i xmm_src, xmm_alpha, xmm_def;
3681 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3682 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3684 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3686 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3692 PIXMAN_IMAGE_GET_LINE (
3693 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3694 PIXMAN_IMAGE_GET_LINE (
3695 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3697 xmm_def = create_mask_2x32_128 (src, src);
3698 xmm_src = expand_pixel_32_1x128 (src);
3699 xmm_alpha = expand_alpha_1x128 (xmm_src);
3700 mmx_src = xmm_src;
3701 mmx_alpha = xmm_alpha;
3706 dst_line += dst_stride;
3708 mask_line += mask_stride;
3711 while (w && (unsigned long)dst & 15)
3713 uint8_t m = *mask++;
3718 mmx_mask = expand_pixel_8_1x128 (m);
3719 mmx_dest = unpack_32_1x128 (d);
3721 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, &mmx_alpha,
3722 &mmx_mask, &mmx_dest));
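3724 /* Main loop: read four a8 mask values at once. A fully opaque solid
3725 * over a fully opaque mask reduces to a store of the pre-expanded source. */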
3733 m = *((uint32_t*)mask);
3735 if (srca == 0xff && m == 0xffffffff)
3737 save_128_aligned ((__m128i*)dst, xmm_def);
3741 xmm_dst = load_128_aligned ((__m128i*) dst);
3742 xmm_mask = unpack_32_1x128 (m);
3743 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3746 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3747 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3749 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3750 &xmm_mask_lo, &xmm_mask_hi);
3752 in_over_2x128 (&xmm_src, &xmm_src,
3753 &xmm_alpha, &xmm_alpha,
3754 &xmm_mask_lo, &xmm_mask_hi,
3755 &xmm_dst_lo, &xmm_dst_hi);
3758 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3768 uint8_t m = *mask++;
3773 mmx_mask = expand_pixel_8_1x128 (m);
3774 mmx_dest = unpack_32_1x128 (d);
3776 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, &mmx_alpha,
3777 &mmx_mask, &mmx_dest));
3790 /* ----------------------------------------------------------------
3791 * pixman_fill_sse2
3795 pixman_fill_sse2 (uint32_t *bits,
3804 uint32_t byte_width;
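3806 /* The stride argument is given in uint32_t units. Each branch below
3807 * converts it to units of the destination bpp for addressing, and
3808 * replicates the fill value so one 32-bit word covers every pixel size. */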
3810 if (bpp == 8)
3811 {
3814 stride = stride * (int) sizeof (uint32_t) / 1;
3815 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3816 byte_width = width;
3821 data = (data & 0xff) * 0x01010101;
3822 }
3823 else if (bpp == 16)
3824 {
3825 stride = stride * (int) sizeof (uint32_t) / 2;
3826 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3827 byte_width = 2 * width;
3828 stride *= 2;
3830 data = (data & 0xffff) * 0x00010001;
3831 }
3832 else if (bpp == 32)
3833 {
3834 stride = stride * (int) sizeof (uint32_t) / 4;
3835 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3836 byte_width = 4 * width;
3837 stride *= 4;
3838 }
3840 else
3841 return FALSE;
3844 xmm_def = create_mask_2x32_128 (data, data);
3849 uint8_t *d = byte_line;
3850 byte_line += stride;
3853 while (w >= 1 && ((unsigned long)d & 1))
3855 *(uint8_t *)d = data;
3860 while (w >= 2 && ((unsigned long)d & 3))
3862 *(uint16_t *)d = data;
3867 while (w >= 4 && ((unsigned long)d & 15))
3869 *(uint32_t *)d = data;
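3871 /* Bulk of the fill: 128 bytes (eight aligned 16-byte stores) per
3872 * iteration, followed by progressively smaller tails. */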
3877 save_128_aligned ((__m128i*)(d), xmm_def);
3878 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3879 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3880 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3881 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3882 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3883 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3884 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3892 save_128_aligned ((__m128i*)(d), xmm_def);
3893 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3894 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3895 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3903 save_128_aligned ((__m128i*)(d), xmm_def);
3904 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3912 save_128_aligned ((__m128i*)(d), xmm_def);
3920 *(uint32_t *)d = data;
3928 *(uint16_t *)d = data;
3935 *(uint8_t *)d = data;
3946 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3948 pixman_image_t * src_image,
3949 pixman_image_t * mask_image,
3950 pixman_image_t * dst_image,
3961 uint32_t *dst_line, *dst;
3962 uint8_t *mask_line, *mask;
3963 int dst_stride, mask_stride;
3967 __m128i xmm_src, xmm_def;
3968 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3970 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3975 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3976 PIXMAN_FORMAT_BPP (dst_image->bits.format),
3977 dest_x, dest_y, width, height, 0);
3981 PIXMAN_IMAGE_GET_LINE (
3982 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3983 PIXMAN_IMAGE_GET_LINE (
3984 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3986 xmm_def = create_mask_2x32_128 (src, src);
3987 xmm_src = expand_pixel_32_1x128 (src);
3992 dst_line += dst_stride;
3994 mask_line += mask_stride;
3997 while (w && (unsigned long)dst & 15)
3999 uint8_t m = *mask++;
4003 *dst = pack_1x128_32 (
4004 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
4017 m = *((uint32_t*)mask);
4019 if (srca == 0xff && m == 0xffffffff)
4021 save_128_aligned ((__m128i*)dst, xmm_def);
4025 xmm_mask = unpack_32_1x128 (m);
4026 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4029 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4031 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4032 &xmm_mask_lo, &xmm_mask_hi);
4034 pix_multiply_2x128 (&xmm_src, &xmm_src,
4035 &xmm_mask_lo, &xmm_mask_hi,
4036 &xmm_mask_lo, &xmm_mask_hi);
4039 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4043 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4053 uint8_t m = *mask++;
4057 *dst = pack_1x128_32 (
4058 pix_multiply_1x128 (
4059 xmm_src, expand_pixel_8_1x128 (m)));
4074 /*-----------------------------------------------------------------------
4075 * composite_over_n_8_0565
4079 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4081 pixman_image_t * src_image,
4082 pixman_image_t * mask_image,
4083 pixman_image_t * dst_image,
4094 uint16_t *dst_line, *dst, d;
4095 uint8_t *mask_line, *mask;
4096 int dst_stride, mask_stride;
4099 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4101 __m128i xmm_src, xmm_alpha;
4102 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4103 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4105 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4111 PIXMAN_IMAGE_GET_LINE (
4112 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4113 PIXMAN_IMAGE_GET_LINE (
4114 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4116 xmm_src = expand_pixel_32_1x128 (src);
4117 xmm_alpha = expand_alpha_1x128 (xmm_src);
4118 mmx_src = xmm_src;
4119 mmx_alpha = xmm_alpha;
4124 dst_line += dst_stride;
4126 mask_line += mask_stride;
4129 while (w && (unsigned long)dst & 15)
4136 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4137 mmx_dest = expand565_16_1x128 (d);
4139 *dst = pack_565_32_16 (
4140 pack_1x128_32 (in_over_1x128 (
4142 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4151 xmm_dst = load_128_aligned ((__m128i*) dst);
4152 unpack_565_128_4x128 (xmm_dst,
4153 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4155 m = *((uint32_t*)mask);
4160 xmm_mask = unpack_32_1x128 (m);
4161 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4164 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4166 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4167 &xmm_mask_lo, &xmm_mask_hi);
4169 in_over_2x128 (&xmm_src, &xmm_src,
4170 &xmm_alpha, &xmm_alpha,
4171 &xmm_mask_lo, &xmm_mask_hi,
4172 &xmm_dst0, &xmm_dst1);
4175 m = *((uint32_t*)mask);
4180 xmm_mask = unpack_32_1x128 (m);
4181 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4184 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4186 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4187 &xmm_mask_lo, &xmm_mask_hi);
4188 in_over_2x128 (&xmm_src, &xmm_src,
4189 &xmm_alpha, &xmm_alpha,
4190 &xmm_mask_lo, &xmm_mask_hi,
4191 &xmm_dst2, &xmm_dst3);
4195 (__m128i*)dst, pack_565_4x128_128 (
4196 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4209 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4210 mmx_dest = expand565_16_1x128 (d);
4212 *dst = pack_565_32_16 (
4213 pack_1x128_32 (in_over_1x128 (
4215 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4226 /* -----------------------------------------------------------------------
4227 * composite_over_pixbuf_0565
4231 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4233 pixman_image_t * src_image,
4234 pixman_image_t * mask_image,
4235 pixman_image_t * dst_image,
4245 uint16_t *dst_line, *dst, d;
4246 uint32_t *src_line, *src, s;
4247 int dst_stride, src_stride;
4249 uint32_t opaque, zero;
4252 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4253 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4255 PIXMAN_IMAGE_GET_LINE (
4256 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4257 PIXMAN_IMAGE_GET_LINE (
4258 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4263 * This code is copied from the MMX implementation, FIXME included.
4264 * If it's a problem there, it's probably a problem here too.
4266 assert (src_image->drawable == mask_image->drawable);
4272 dst_line += dst_stride;
4274 src_line += src_stride;
4277 while (w && (unsigned long)dst & 15)
4282 ms = unpack_32_1x128 (s);
4284 *dst++ = pack_565_32_16 (pack_1x128_32 (
4286 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
4293 xmm_src = load_128_unaligned ((__m128i*)src);
4294 xmm_dst = load_128_aligned ((__m128i*)dst);
4296 opaque = is_opaque (xmm_src);
4297 zero = is_zero (xmm_src);
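4298 /* pixbuf sources are non-premultiplied with reversed channel order: opaque blocks only need a channel swap, zero blocks are skipped, the rest take the full over_rev_non_pre path. */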
4299 unpack_565_128_4x128 (xmm_dst,
4300 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4301 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4303 /* preload next round */
4304 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4308 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4309 &xmm_dst0, &xmm_dst1);
4313 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4314 &xmm_dst0, &xmm_dst1);
4318 opaque = is_opaque (xmm_src);
4319 zero = is_zero (xmm_src);
4321 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4325 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4326 &xmm_dst2, &xmm_dst3);
4330 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4331 &xmm_dst2, &xmm_dst3);
4335 (__m128i*)dst, pack_565_4x128_128 (
4336 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4348 ms = unpack_32_1x128 (s);
4350 *dst++ = pack_565_32_16 (pack_1x128_32 (
4352 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
4360 /* -------------------------------------------------------------------------
4361 * composite_over_pixbuf_8888
4365 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4367 pixman_image_t * src_image,
4368 pixman_image_t * mask_image,
4369 pixman_image_t * dst_image,
4379 uint32_t *dst_line, *dst, d;
4380 uint32_t *src_line, *src, s;
4381 int dst_stride, src_stride;
4383 uint32_t opaque, zero;
4385 __m128i xmm_src_lo, xmm_src_hi;
4386 __m128i xmm_dst_lo, xmm_dst_hi;
4388 PIXMAN_IMAGE_GET_LINE (
4389 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4390 PIXMAN_IMAGE_GET_LINE (
4391 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4396 * This code is copied from the MMX implementation, FIXME included.
4397 * If it's a problem there, it's probably a problem here too.
4399 assert (src_image->drawable == mask_image->drawable);
4405 dst_line += dst_stride;
4407 src_line += src_stride;
4410 while (w && (unsigned long)dst & 15)
4415 *dst++ = pack_1x128_32 (
4416 over_rev_non_pre_1x128 (
4417 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4424 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4426 opaque = is_opaque (xmm_src_hi);
4427 zero = is_zero (xmm_src_hi);
4429 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4433 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4434 &xmm_dst_lo, &xmm_dst_hi);
4437 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4441 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4443 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4445 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4446 &xmm_dst_lo, &xmm_dst_hi);
4449 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4462 *dst++ = pack_1x128_32 (
4463 over_rev_non_pre_1x128 (
4464 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4473 /* -------------------------------------------------------------------------------------------------
4474 * composite_over_n_8888_0565_ca
4478 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4480 pixman_image_t * src_image,
4481 pixman_image_t * mask_image,
4482 pixman_image_t * dst_image,
4493 uint16_t *dst_line, *dst, d;
4494 uint32_t *mask_line, *mask, m;
4495 int dst_stride, mask_stride;
4499 __m128i xmm_src, xmm_alpha;
4500 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4501 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4503 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4505 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4510 PIXMAN_IMAGE_GET_LINE (
4511 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4512 PIXMAN_IMAGE_GET_LINE (
4513 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4515 xmm_src = expand_pixel_32_1x128 (src);
4516 xmm_alpha = expand_alpha_1x128 (xmm_src);
4517 mmx_src = xmm_src;
4518 mmx_alpha = xmm_alpha;
4525 mask_line += mask_stride;
4526 dst_line += dst_stride;
4528 while (w && ((unsigned long)dst & 15))
4530 m = *(uint32_t *) mask;
4535 mmx_mask = unpack_32_1x128 (m);
4536 mmx_dest = expand565_16_1x128 (d);
4538 *dst = pack_565_32_16 (
4539 pack_1x128_32 (in_over_1x128 (
4541 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4552 xmm_mask = load_128_unaligned ((__m128i*)mask);
4553 xmm_dst = load_128_aligned ((__m128i*)dst);
4555 pack_cmp = _mm_movemask_epi8 (
4556 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4558 unpack_565_128_4x128 (xmm_dst,
4559 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4560 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4562 /* preload next round */
4563 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4565 /* composite the first four pixels only if some mask bits are set */
4566 if (pack_cmp != 0xffff)
4568 in_over_2x128 (&xmm_src, &xmm_src,
4569 &xmm_alpha, &xmm_alpha,
4570 &xmm_mask_lo, &xmm_mask_hi,
4571 &xmm_dst0, &xmm_dst1);
4575 pack_cmp = _mm_movemask_epi8 (
4576 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4578 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4580 if (pack_cmp != 0xffff)
4582 in_over_2x128 (&xmm_src, &xmm_src,
4583 &xmm_alpha, &xmm_alpha,
4584 &xmm_mask_lo, &xmm_mask_hi,
4585 &xmm_dst2, &xmm_dst3);
4589 (__m128i*)dst, pack_565_4x128_128 (
4590 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4599 m = *(uint32_t *) mask;
4604 mmx_mask = unpack_32_1x128 (m);
4605 mmx_dest = expand565_16_1x128 (d);
4607 *dst = pack_565_32_16 (
4608 pack_1x128_32 (in_over_1x128 (
4610 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4622 /* -----------------------------------------------------------------------
4623 * composite_in_n_8_8
4627 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4629 pixman_image_t * src_image,
4630 pixman_image_t * mask_image,
4631 pixman_image_t * dst_image,
4641 uint8_t *dst_line, *dst;
4642 uint8_t *mask_line, *mask;
4643 int dst_stride, mask_stride;
4650 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4651 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4653 PIXMAN_IMAGE_GET_LINE (
4654 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4655 PIXMAN_IMAGE_GET_LINE (
4656 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4658 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4662 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
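4663 /* The IN operator is a per-channel multiply: dest = srca * mask * dest,
4664 * with each pix_multiply rounding the product back to 8 bits. */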
4667 dst_line += dst_stride;
4669 mask_line += mask_stride;
4672 while (w && ((unsigned long)dst & 15))
4674 m = (uint32_t) *mask++;
4675 d = (uint32_t) *dst;
4677 *dst++ = (uint8_t) pack_1x128_32 (
4678 pix_multiply_1x128 (
4679 pix_multiply_1x128 (xmm_alpha,
4680 unpack_32_1x128 (m)),
4681 unpack_32_1x128 (d)));
4687 xmm_mask = load_128_unaligned ((__m128i*)mask);
4688 xmm_dst = load_128_aligned ((__m128i*)dst);
4690 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4691 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4693 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4694 &xmm_mask_lo, &xmm_mask_hi,
4695 &xmm_mask_lo, &xmm_mask_hi);
4697 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4698 &xmm_dst_lo, &xmm_dst_hi,
4699 &xmm_dst_lo, &xmm_dst_hi);
4702 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4711 m = (uint32_t) *mask++;
4712 d = (uint32_t) *dst;
4714 *dst++ = (uint8_t) pack_1x128_32 (
4715 pix_multiply_1x128 (
4716 pix_multiply_1x128 (
4717 xmm_alpha, unpack_32_1x128 (m)),
4718 unpack_32_1x128 (d)));
4726 /* -----------------------------------------------------------------------
4731 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4733 pixman_image_t * src_image,
4734 pixman_image_t * mask_image,
4735 pixman_image_t * dst_image,
4745 uint8_t *dst_line, *dst;
4752 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4754 PIXMAN_IMAGE_GET_LINE (
4755 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4757 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4759 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4768 pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4769 8, dest_x, dest_y, width, height, src);
4777 dst_line += dst_stride;
4780 while (w && ((unsigned long)dst & 15))
4782 d = (uint32_t) *dst;
4784 *dst++ = (uint8_t) pack_1x128_32 (
4785 pix_multiply_1x128 (
4786 xmm_alpha,
4787 unpack_32_1x128 (d)));
4793 xmm_dst = load_128_aligned ((__m128i*)dst);
4795 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4797 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4798 &xmm_dst_lo, &xmm_dst_hi,
4799 &xmm_dst_lo, &xmm_dst_hi);
4802 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4810 d = (uint32_t) *dst;
4812 *dst++ = (uint8_t) pack_1x128_32 (
4813 pix_multiply_1x128 (
4814 xmm_alpha,
4815 unpack_32_1x128 (d)));
4823 /* ---------------------------------------------------------------------------
4828 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4830 pixman_image_t * src_image,
4831 pixman_image_t * mask_image,
4832 pixman_image_t * dst_image,
4842 uint8_t *dst_line, *dst;
4843 uint8_t *src_line, *src;
4844 int src_stride, dst_stride;
4848 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4849 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4851 PIXMAN_IMAGE_GET_LINE (
4852 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4853 PIXMAN_IMAGE_GET_LINE (
4854 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4859 dst_line += dst_stride;
4861 src_line += src_stride;
4864 while (w && ((unsigned long)dst & 15))
4866 s = (uint32_t) *src++;
4867 d = (uint32_t) *dst;
4869 *dst++ = (uint8_t) pack_1x128_32 (
4870 pix_multiply_1x128 (
4871 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4877 xmm_src = load_128_unaligned ((__m128i*)src);
4878 xmm_dst = load_128_aligned ((__m128i*)dst);
4880 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4881 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4883 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4884 &xmm_dst_lo, &xmm_dst_hi,
4885 &xmm_dst_lo, &xmm_dst_hi);
4888 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4897 s = (uint32_t) *src++;
4898 d = (uint32_t) *dst;
4900 *dst++ = (uint8_t) pack_1x128_32 (
4901 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4909 /* -------------------------------------------------------------------------
4910 * composite_add_n_8_8
4914 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4916 pixman_image_t * src_image,
4917 pixman_image_t * mask_image,
4918 pixman_image_t * dst_image,
4928 uint8_t *dst_line, *dst;
4929 uint8_t *mask_line, *mask;
4930 int dst_stride, mask_stride;
4937 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4938 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4940 PIXMAN_IMAGE_GET_LINE (
4941 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4942 PIXMAN_IMAGE_GET_LINE (
4943 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4945 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4949 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4954 dst_line += dst_stride;
4956 mask_line += mask_stride;
4959 while (w && ((unsigned long)dst & 15))
4961 m = (uint32_t) *mask++;
4962 d = (uint32_t) *dst;
4964 *dst++ = (uint8_t) pack_1x128_32 (
4965 _mm_adds_epu16 (
4966 pix_multiply_1x128 (
4967 xmm_alpha, unpack_32_1x128 (m)),
4968 unpack_32_1x128 (d)));
4974 xmm_mask = load_128_unaligned ((__m128i*)mask);
4975 xmm_dst = load_128_aligned ((__m128i*)dst);
4977 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4978 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4980 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4981 &xmm_mask_lo, &xmm_mask_hi,
4982 &xmm_mask_lo, &xmm_mask_hi);
4984 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4985 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4988 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4997 m = (uint32_t) *mask++;
4998 d = (uint32_t) *dst;
5000 *dst++ = (uint8_t) pack_1x128_32 (
5001 _mm_adds_epu16 (
5002 pix_multiply_1x128 (
5003 xmm_alpha, unpack_32_1x128 (m)),
5004 unpack_32_1x128 (d)));
5013 /* -------------------------------------------------------------------------
5014 * composite_add_n_8
5018 sse2_composite_add_n_8 (pixman_implementation_t *imp,
5020 pixman_image_t * src_image,
5021 pixman_image_t * mask_image,
5022 pixman_image_t * dst_image,
5032 uint8_t *dst_line, *dst;
5039 PIXMAN_IMAGE_GET_LINE (
5040 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5042 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5051 pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
5052 8, dest_x, dest_y, width, height, 0xff);
5057 src = (src << 24) | (src << 16) | (src << 8) | src;
5058 xmm_src = _mm_set_epi32 (src, src, src, src);
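5059 /* The 8-bit value now fills all 16 bytes of xmm_src, so _mm_adds_epu8
5060 * below performs a saturating add on 16 a8 pixels per iteration. */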
5063 dst_line += dst_stride;
5066 while (w && ((unsigned long)dst & 15))
5068 *dst = (uint8_t)_mm_cvtsi128_si32 (
5069 _mm_adds_epu8 (xmm_src,
5071 _mm_cvtsi32_si128 (*dst)));
5080 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
5088 *dst = (uint8_t)_mm_cvtsi128_si32 (
5089 _mm_adds_epu8 (xmm_src,
5091 _mm_cvtsi32_si128 (*dst)));
5101 /* ----------------------------------------------------------------------
5106 sse2_composite_add_8_8 (pixman_implementation_t *imp,
5108 pixman_image_t * src_image,
5109 pixman_image_t * mask_image,
5110 pixman_image_t * dst_image,
5120 uint8_t *dst_line, *dst;
5121 uint8_t *src_line, *src;
5122 int dst_stride, src_stride;
5126 PIXMAN_IMAGE_GET_LINE (
5127 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5128 PIXMAN_IMAGE_GET_LINE (
5129 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5136 dst_line += dst_stride;
5137 src_line += src_stride;
5141 while (w && (unsigned long)dst & 3)
5143 t = (*dst) + (*src++);
5144 *dst++ = t | (0 - (t >> 8));
5148 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5158 t = (*dst) + (*src++);
5159 *dst++ = t | (0 - (t >> 8));
5167 /* ---------------------------------------------------------------------
5168 * composite_add_8888_8888
5171 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5173 pixman_image_t * src_image,
5174 pixman_image_t * mask_image,
5175 pixman_image_t * dst_image,
5185 uint32_t *dst_line, *dst;
5186 uint32_t *src_line, *src;
5187 int dst_stride, src_stride;
5189 PIXMAN_IMAGE_GET_LINE (
5190 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5191 PIXMAN_IMAGE_GET_LINE (
5192 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5197 dst_line += dst_stride;
5199 src_line += src_stride;
5201 core_combine_add_u_sse2 (dst, src, NULL, width);
5207 /* -------------------------------------------------------------------------------------------------
5208 * sse2_composite_copy_area
5211 static pixman_bool_t
5212 pixman_blt_sse2 (uint32_t *src_bits,
5225 uint8_t * src_bytes;
5226 uint8_t * dst_bytes;
5229 if (src_bpp != dst_bpp)
5230 return FALSE;
5232 if (src_bpp == 16)
5233 {
5234 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5235 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5236 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5237 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5238 byte_width = 2 * width;
5239 src_stride *= 2;
5240 dst_stride *= 2;
5241 }
5242 else if (src_bpp == 32)
5243 {
5244 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5245 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5246 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5247 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5248 byte_width = 4 * width;
5249 src_stride *= 4;
5250 dst_stride *= 4;
5251 }
5253 else
5254 return FALSE;
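5256 /* From here on the copy runs in bytes: align the destination, then
5257 * move 64 bytes per iteration with unaligned loads and aligned stores. */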
5260 uint8_t *s = src_bytes;
5261 uint8_t *d = dst_bytes;
5262 src_bytes += src_stride;
5263 dst_bytes += dst_stride;
5266 while (w >= 2 && ((unsigned long)d & 3))
5268 *(uint16_t *)d = *(uint16_t *)s;
5274 while (w >= 4 && ((unsigned long)d & 15))
5276 *(uint32_t *)d = *(uint32_t *)s;
5285 __m128i xmm0, xmm1, xmm2, xmm3;
5287 xmm0 = load_128_unaligned ((__m128i*)(s));
5288 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5289 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5290 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5292 save_128_aligned ((__m128i*)(d), xmm0);
5293 save_128_aligned ((__m128i*)(d + 16), xmm1);
5294 save_128_aligned ((__m128i*)(d + 32), xmm2);
5295 save_128_aligned ((__m128i*)(d + 48), xmm3);
5304 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5313 *(uint32_t *)d = *(uint32_t *)s;
5322 *(uint16_t *)d = *(uint16_t *)s;
5335 sse2_composite_copy_area (pixman_implementation_t *imp,
5337 pixman_image_t * src_image,
5338 pixman_image_t * mask_image,
5339 pixman_image_t * dst_image,
5349 pixman_blt_sse2 (src_image->bits.bits,
5350 dst_image->bits.bits,
5351 src_image->bits.rowstride,
5352 dst_image->bits.rowstride,
5353 PIXMAN_FORMAT_BPP (src_image->bits.format),
5354 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5355 src_x, src_y, dest_x, dest_y, width, height);
5359 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5361 pixman_image_t * src_image,
5362 pixman_image_t * mask_image,
5363 pixman_image_t * dst_image,
5373 uint32_t *src, *src_line, s;
5374 uint32_t *dst, *dst_line, d;
5375 uint8_t *mask, *mask_line;
5377 int src_stride, mask_stride, dst_stride;
5381 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5382 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5383 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5385 PIXMAN_IMAGE_GET_LINE (
5386 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5387 PIXMAN_IMAGE_GET_LINE (
5388 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5389 PIXMAN_IMAGE_GET_LINE (
5390 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5395 src_line += src_stride;
5397 dst_line += dst_stride;
5399 mask_line += mask_stride;
5403 while (w && (unsigned long)dst & 15)
5405 s = 0xff000000 | *src++;
5406 m = (uint32_t) *mask++;
5408 ms = unpack_32_1x128 (s);
5412 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
5413 __m128i md = unpack_32_1x128 (d);
5415 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
5418 *dst++ = pack_1x128_32 (ms);
5424 m = *(uint32_t*) mask;
5425 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5427 if (m == 0xffffffff)
5429 save_128_aligned ((__m128i*)dst, xmm_src);
5433 xmm_dst = load_128_aligned ((__m128i*)dst);
5435 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5437 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5438 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5439 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5441 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5443 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5445 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5456 m = (uint32_t) *mask++;
5460 s = 0xff000000 | *src;
5472 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
5473 md = unpack_32_1x128 (d);
5474 ms = unpack_32_1x128 (s);
5476 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
5491 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5493 pixman_image_t * src_image,
5494 pixman_image_t * mask_image,
5495 pixman_image_t * dst_image,
5505 uint32_t *src, *src_line, s;
5506 uint32_t *dst, *dst_line, d;
5507 uint8_t *mask, *mask_line;
5509 int src_stride, mask_stride, dst_stride;
5512 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5513 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5514 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5516 PIXMAN_IMAGE_GET_LINE (
5517 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5518 PIXMAN_IMAGE_GET_LINE (
5519 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5520 PIXMAN_IMAGE_GET_LINE (
5521 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5526 src_line += src_stride;
5528 dst_line += dst_stride;
5530 mask_line += mask_stride;
5534 while (w && (unsigned long)dst & 15)
5539 m = (uint32_t) *mask++;
5546 if (sa == 0xff && m == 0xff)
5552 __m128i ms, md, ma, msa;
5554 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5555 ms = unpack_32_1x128 (s);
5556 md = unpack_32_1x128 (d);
5558 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5560 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5570 m = *(uint32_t *) mask;
5574 xmm_src = load_128_unaligned ((__m128i*)src);
5576 if (m == 0xffffffff && is_opaque (xmm_src))
5578 save_128_aligned ((__m128i *)dst, xmm_src);
5582 xmm_dst = load_128_aligned ((__m128i *)dst);
5584 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5586 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5587 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5588 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5590 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5591 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5593 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5594 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5596 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5611 m = (uint32_t) *mask++;
5618 if (sa == 0xff && m == 0xff)
5624 __m128i ms, md, ma, msa;
5626 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5627 ms = unpack_32_1x128 (s);
5628 md = unpack_32_1x128 (d);
5630 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5632 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5645 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5647 pixman_image_t * src_image,
5648 pixman_image_t * mask_image,
5649 pixman_image_t * dst_image,
5660 uint32_t *dst_line, *dst;
5662 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5663 __m128i xmm_dsta_hi, xmm_dsta_lo;
5667 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5672 PIXMAN_IMAGE_GET_LINE (
5673 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5675 xmm_src = expand_pixel_32_1x128 (src);
5681 dst_line += dst_stride;
5684 while (w && (unsigned long)dst & 15)
5688 vd = unpack_32_1x128 (*dst);
5690 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5691 xmm_src));
5698 __m128i tmp_lo, tmp_hi;
5700 xmm_dst = load_128_aligned ((__m128i*)dst);
5702 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5703 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5705 tmp_lo = xmm_src;
5706 tmp_hi = xmm_src;
5708 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5709 &xmm_dsta_lo, &xmm_dsta_hi,
5710 &tmp_lo, &tmp_hi);
5712 save_128_aligned (
5713 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5723 vd = unpack_32_1x128 (*dst);
5725 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5726 xmm_src));
5737 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5739 pixman_image_t * src_image,
5740 pixman_image_t * mask_image,
5741 pixman_image_t * dst_image,
5751 uint32_t *src, *src_line, s;
5752 uint32_t *dst, *dst_line, d;
5753 uint32_t *mask, *mask_line;
5755 int src_stride, mask_stride, dst_stride;
5758 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5759 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5760 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5762 PIXMAN_IMAGE_GET_LINE (
5763 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5764 PIXMAN_IMAGE_GET_LINE (
5765 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5766 PIXMAN_IMAGE_GET_LINE (
5767 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5772 src_line += src_stride;
5774 dst_line += dst_stride;
5776 mask_line += mask_stride;
5780 while (w && (unsigned long)dst & 15)
5785 m = (*mask++) >> 24;
5792 if (sa == 0xff && m == 0xff)
5798 __m128i ms, md, ma, msa;
5800 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5801 ms = unpack_32_1x128 (s);
5802 md = unpack_32_1x128 (d);
5804 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5806 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5816 xmm_mask = load_128_unaligned ((__m128i*)mask);
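5817 /* Common cases first: a fully transparent mask leaves the destination untouched; an opaque mask over an opaque source is a plain copy. */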
5818 if (!is_transparent (xmm_mask))
5820 xmm_src = load_128_unaligned ((__m128i*)src);
5822 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5824 save_128_aligned ((__m128i *)dst, xmm_src);
5828 xmm_dst = load_128_aligned ((__m128i *)dst);
5830 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5831 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5832 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5834 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5835 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5837 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5838 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5840 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5855 m = (*mask++) >> 24;
5862 if (sa == 0xff && m == 0xff)
5868 __m128i ms, md, ma, msa;
5870 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5871 ms = unpack_32_1x128 (s);
5872 md = unpack_32_1x128 (d);
5874 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5876 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5888 /* A variant of 'core_combine_over_u_sse2' with minor tweaks */
5889 static force_inline void
5890 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5894 pixman_fixed_t unit_x,
5895 pixman_fixed_t max_vx,
5896 pixman_bool_t fully_transparent_src)
5899 const uint32_t* pm = NULL;
5901 __m128i xmm_dst_lo, xmm_dst_hi;
5902 __m128i xmm_src_lo, xmm_src_hi;
5903 __m128i xmm_alpha_lo, xmm_alpha_hi;
5905 if (fully_transparent_src)
5906 return;
5908 /* Align dst on a 16-byte boundary */
5909 while (w && ((unsigned long)pd & 15))
5912 s = combine1 (ps + (vx >> 16), pm);
5915 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5924 uint32_t tmp1, tmp2, tmp3, tmp4;
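5925 /* Gather four source pixels at successive fixed-point positions (vx advances by unit_x between fetches) and assemble them into one register. */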
5926 tmp1 = ps[vx >> 16];
5928 tmp2 = ps[vx >> 16];
5930 tmp3 = ps[vx >> 16];
5932 tmp4 = ps[vx >> 16];
5935 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5937 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5939 if (is_opaque (xmm_src_hi))
5941 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5943 else if (!is_zero (xmm_src_hi))
5945 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5947 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5948 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5950 expand_alpha_2x128 (
5951 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5953 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5954 &xmm_alpha_lo, &xmm_alpha_hi,
5955 &xmm_dst_lo, &xmm_dst_hi);
5957 /* rebuild the 4 pixel data and save */
5958 save_128_aligned ((__m128i*)pd,
5959 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5971 s = combine1 (ps + (vx >> 16), pm);
5974 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5983 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5984 scaled_nearest_scanline_sse2_8888_8888_OVER,
5985 uint32_t, uint32_t, COVER)
5986 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5987 scaled_nearest_scanline_sse2_8888_8888_OVER,
5988 uint32_t, uint32_t, NONE)
5989 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5990 scaled_nearest_scanline_sse2_8888_8888_OVER,
5991 uint32_t, uint32_t, PAD)
5993 static force_inline void
5994 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5996 const uint32_t * src,
5999 pixman_fixed_t unit_x,
6000 pixman_fixed_t max_vx,
6001 pixman_bool_t zero_src)
6004 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
6005 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6006 __m128i xmm_alpha_lo, xmm_alpha_hi;
6008 if (zero_src || (*mask >> 24) == 0)
6011 xmm_mask = create_mask_16_128 (*mask >> 24);
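6012 /* Only the alpha of the solid mask matters; replicate it across every 16-bit channel slot for use by in_over. */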
6013 while (w && (unsigned long)dst & 15)
6015 uint32_t s = src[pixman_fixed_to_int (vx)];
6022 __m128i ms = unpack_32_1x128 (s);
6023 __m128i alpha = expand_alpha_1x128 (ms);
6024 __m128i dest = xmm_mask;
6025 __m128i alpha_dst = unpack_32_1x128 (d);
6027 *dst = pack_1x128_32 (
6028 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6036 uint32_t tmp1, tmp2, tmp3, tmp4;
6038 tmp1 = src[pixman_fixed_to_int (vx)];
6040 tmp2 = src[pixman_fixed_to_int (vx)];
6042 tmp3 = src[pixman_fixed_to_int (vx)];
6044 tmp4 = src[pixman_fixed_to_int (vx)];
6047 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
6049 if (!is_zero (xmm_src))
6051 xmm_dst = load_128_aligned ((__m128i*)dst);
6053 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6054 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6055 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6056 &xmm_alpha_lo, &xmm_alpha_hi);
6058 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6059 &xmm_alpha_lo, &xmm_alpha_hi,
6060 &xmm_mask, &xmm_mask,
6061 &xmm_dst_lo, &xmm_dst_hi);
6064 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6073 uint32_t s = src[pixman_fixed_to_int (vx)];
6080 __m128i ms = unpack_32_1x128 (s);
6081 __m128i alpha = expand_alpha_1x128 (ms);
6082 __m128i mask = xmm_mask;
6083 __m128i dest = unpack_32_1x128 (d);
6085 *dst = pack_1x128_32 (
6086 in_over_1x128 (&ms, &alpha, &mask, &dest));
6096 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6097 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
6098 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
6099 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6100 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
6101 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
6102 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6103 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
6104 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
6106 static const pixman_fast_path_t sse2_fast_paths[] =
6108 /* PIXMAN_OP_OVER */
6109 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6110 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6111 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6112 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6113 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6114 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6115 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6116 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6117 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6118 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6119 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6120 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6121 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6122 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6123 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6124 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6125 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6126 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6127 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6128 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6129 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6130 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6131 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6132 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6133 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6134 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6135 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6136 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6137 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6138 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6139 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6140 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6141 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6142 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6143 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6144 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6145 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6146 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6147 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6148 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6149 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6150 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6151 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6152 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6153 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6154 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6156 /* PIXMAN_OP_OVER_REVERSE */
6157 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6158 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
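6160 /* PIXMAN_OP_ADD */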
6161 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6162 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6163 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6164 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6165 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6166 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
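6168 /* PIXMAN_OP_SRC */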
6169 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6170 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6171 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6172 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6173 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6174 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6175 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6176 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6177 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6178 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6179 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6180 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6181 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6182 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
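6184 /* PIXMAN_OP_IN */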
6185 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6186 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6187 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6189 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6190 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6191 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6192 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6193 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6194 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6195 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6196 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6197 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6198 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6199 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6200 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6202 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6203 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6204 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6205 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6210 static pixman_bool_t
6211 sse2_blt (pixman_implementation_t *imp,
6212 uint32_t * src_bits,
6213 uint32_t * dst_bits,
6225 if (!pixman_blt_sse2 (
6226 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6227 src_x, src_y, dst_x, dst_y, width, height))
6230 return _pixman_implementation_blt (
6232 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
6233 src_x, src_y, dst_x, dst_y, width, height);
6239 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6240 __attribute__((__force_align_arg_pointer__))
6241 #endif
6242 static pixman_bool_t
6243 sse2_fill (pixman_implementation_t *imp,
6253 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
6255 return _pixman_implementation_fill (
6256 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
6263 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6265 int w = iter->width;
6266 __m128i ff000000 = mask_ff000000;
6267 uint32_t *dst = iter->buffer;
6268 uint32_t *src = (uint32_t *)iter->bits;
6270 iter->bits += iter->stride;
6272 while (w && ((unsigned long)dst) & 0x0f)
6274 *dst++ = (*src++) | 0xff000000;
6281 (__m128i *)dst, _mm_or_si128 (
6282 load_128_unaligned ((__m128i *)src), ff000000));
6291 *dst++ = (*src++) | 0xff000000;
6295 return iter->buffer;
6299 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6301 int w = iter->width;
6302 uint32_t *dst = iter->buffer;
6303 uint16_t *src = (uint16_t *)iter->bits;
6304 __m128i ff000000 = mask_ff000000;
6306 iter->bits += iter->stride;
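6307 /* Expand r5g6b5 to a8r8g8b8: unpack_565_to_8888 widens each 5/6-bit channel (replicating its top bits into the low bits) and alpha is forced to 0xff. */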
6308 while (w && ((unsigned long)dst) & 0x0f)
6310 uint16_t s = *src++;
6312 *dst++ = CONVERT_0565_TO_8888 (s);
6320 s = _mm_loadu_si128 ((__m128i *)src);
6322 lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6323 hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6325 save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6326 save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6335 uint16_t s = *src++;
6337 *dst++ = CONVERT_0565_TO_8888 (s);
6341 return iter->buffer;
static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

    iter->bits += iter->stride;

    /* Leading pixels, until dst is 16-byte aligned */
    while (w && (((unsigned long)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    /* Widen sixteen a8 pixels to a8r8g8b8 (alpha in the top byte,
     * color bytes zero) per iteration */
    while (w >= 16)
    {
        xmm0 = _mm_loadu_si128((__m128i *)src);

        xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
        xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
        xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
        xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
        xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
        xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);

        _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
        _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
        _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
        _mm_store_si128(((__m128i *)(dst + 12)), xmm6);

        dst += 16;
        src += 16;
        w -= 16;
    }

    /* Trailing pixels */
    while (w)
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    return iter->buffer;
}
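/* Interleaving zeros *below* each alpha byte is what implements the "<< 24"
 * of the scalar loops.  Following one alpha byte 0xab through the two
 * unpack stages (lanes shown little-endian):
 *
 *     _mm_unpacklo_epi8  (0, x):  0xab   -> 0xab00       (16-bit lane)
 *     _mm_unpacklo_epi16 (0, x):  0xab00 -> 0xab000000   (32-bit lane)
 *
 * leaving the alpha in bits 24-31 and the r/g/b bytes zero, sixteen pixels
 * per loop iteration. */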
typedef struct
{
    pixman_format_code_t       format;
    pixman_iter_get_scanline_t get_scanline;
} fetcher_info_t;

static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5,   sse2_fetch_r5g6b5 },
    { PIXMAN_a8,       sse2_fetch_a8 },
    { PIXMAN_null },
};
static void
sse2_src_iter_init (pixman_implementation_t *imp,
                    pixman_iter_t *iter,
                    pixman_image_t *image,
                    int x, int y, int width, int height,
                    uint8_t *buffer, iter_flags_t flags)
{
#define FLAGS                                                           \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)

    if ((flags & ITER_NARROW)                   &&
        (image->common.flags & FLAGS) == FLAGS  &&
        x >= 0 && y >= 0                        &&
        x + width <= image->bits.width          &&
        y + height <= image->bits.height)
    {
        const fetcher_info_t *f;

        for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
        {
            if (image->common.extended_format_code == f->format)
            {
                uint8_t *b = (uint8_t *)image->bits.bits;
                int s = image->bits.rowstride * 4;

                iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
                iter->stride = s;
                iter->width = width;
                iter->buffer = (uint32_t *)buffer;

                iter->get_scanline = f->get_scanline;
                return;
            }
        }
    }

    _pixman_implementation_src_iter_init (
        imp->delegate, iter, image, x, y, width, height, buffer, flags);
}
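/* A sketch of how these iterators are consumed (names are illustrative;
 * the real loop lives in the general compositing code):
 *
 *     for (i = 0; i < height; ++i)
 *     {
 *         uint32_t *row = iter->get_scanline (iter, mask);
 *         ... combine one row from 'row' into the destination ...
 *     }
 *
 * Each fetcher converts one scanline into the narrow a8r8g8b8 buffer and
 * advances iter->bits by iter->stride, which is why iter->stride must be
 * filled in above. */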
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
    /* MMX constants */
    mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
    mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);

    mask_x0080 = create_mask_16_64 (0x0080);
    mask_x00ff = create_mask_16_64 (0x00ff);
    mask_x0101 = create_mask_16_64 (0x0101);
    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);

    _mm_empty ();
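/* The mask helpers are defined earlier in this file: create_mask_16_128
 * broadcasts one 16-bit value into all eight lanes (so mask_00ff ends up
 * as 0x00ff00ff...00ff) and create_mask_2x32_128 repeats a pair of 32-bit
 * words across the register; the _64 variants build the analogous __m64
 * MMX constants.  The _mm_empty () clears the MMX state touched while
 * building those __m64 values, so later x87 floating-point code is safe. */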
    /* Set up function pointers */

    /* SSE2 combiners replace the generic ones from pixman-combine32.c */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->src_iter_init = sse2_src_iter_init;

    return imp;
}
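/* A sketch of where this constructor sits in the implementation chain;
 * runtime CPU detection and the actual wiring live in pixman-cpu.c:
 *
 *     imp = _pixman_implementation_create_general ();
 *     imp = _pixman_implementation_create_fast_path (imp);
 *     if (pixman_have_mmx ())
 *         imp = _pixman_implementation_create_mmx (imp);
 *     if (pixman_have_sse2 ())
 *         imp = _pixman_implementation_create_sse2 (imp);
 *
 * Each layer keeps its predecessor as the delegate, so anything the SSE2
 * paths reject falls through to MMX and finally to the generic C code. */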
#endif /* USE_SSE2 */