2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38 #include "pixman-fast-path.h"
40 #if defined(_MSC_VER) && defined(_M_AMD64)
41 /* Windows 64 doesn't allow MMX to be used, so
42 * the pixman-x64-mmx-emulation.h file contains
43 * implementations of those MMX intrinsics that
44 * are used in the SSE2 implementation.
46 # include "pixman-x64-mmx-emulation.h"
51 /* --------------------------------------------------------------------
/* Constant masks used by the MMX (64-bit) helpers below.
 * NOTE(review): presumably initialised by setup code on lines not
 * visible in this chunk — confirm before relying on their values. */
static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_x_alpha;

/* Masks used by expand565_16_1x64 for r5g6b5 unpacking. */
static __m64 mask_x565_rgb;
static __m64 mask_x565_unpack;

/* Constant masks for the SSE2 (128-bit) helpers. */
static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

/* Per-channel masks for packing 8888 pixels down to r5g6b5. */
static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
/* Per-channel masks for expanding r5g6b5 up to 8888. */
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

/* Masks used to replicate top bits into the fresh low bits when
 * expanding 565 channels to full 8-bit range. */
static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;
80 /* ----------------------------------------------------------------------
83 static force_inline __m128i
84 unpack_32_1x128 (uint32_t data)
86 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
89 static force_inline void
90 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
92 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
93 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
/* Expand 4 r5g6b5 pixels (one per 32-bit lane) to x8r8g8b8.  After
 * shifting each field into position, the channel's top bits are
 * OR-ed into the newly created low bits so 0x1f/0x3f expand to 0xff.
 * NOTE(review): the braces fall on lines missing from this chunk;
 * the statements themselves appear complete. */
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    /* Replicate the 5 top bits of red/blue into their low 3 bits. */
    rb = _mm_or_si128 (r, b);
    t = _mm_and_si128 (rb, mask_565_fix_rb);
    t = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    /* Replicate the 6 top bits of green into its low 2 bits. */
    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
/* Unpack 8 r5g6b5 pixels (one __m128i) into four registers of
 * 16 bpc data via unpack_565_to_8888 and unpack_128_2x128.
 * NOTE(review): the remaining parameters (presumably __m128i
 * *data0..*data3) and the lo/hi locals are declared on lines
 * missing from this chunk. */
static force_inline void
unpack_565_128_4x128 (__m128i data,
    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
136 static force_inline uint16_t
137 pack_565_32_16 (uint32_t pixel)
139 return (uint16_t) (((pixel >> 8) & 0xf800) |
140 ((pixel >> 5) & 0x07e0) |
141 ((pixel >> 3) & 0x001f));
144 static force_inline __m128i
145 pack_2x128_128 (__m128i lo, __m128i hi)
147 return _mm_packus_epi16 (lo, hi);
/* Pack two registers of 16 bpc data (4 pixels) to 8888 and then
 * rearrange each pixel's channels into r5g6b5 layout.
 * NOTE(review): the declaration of 'data' (presumably __m128i) and
 * the braces fall on lines missing from this chunk. */
static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
166 static force_inline __m128i
167 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
169 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
170 pack_565_2x128_128 (*xmm2, *xmm3));
173 static force_inline int
174 is_opaque (__m128i x)
176 __m128i ffs = _mm_cmpeq_epi8 (x, x);
178 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
static force_inline int
/* NOTE(review): the function's name/parameter line is missing from
 * this chunk.  From the body, this tests whether every byte of the
 * register is zero — presumably pixman's is_zero (__m128i x). */
    return _mm_movemask_epi8 (
	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
188 static force_inline int
189 is_transparent (__m128i x)
191 return (_mm_movemask_epi8 (
192 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
195 static force_inline __m128i
196 expand_pixel_32_1x128 (uint32_t data)
198 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
201 static force_inline __m128i
202 expand_alpha_1x128 (__m128i data)
204 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
205 _MM_SHUFFLE (3, 3, 3, 3)),
206 _MM_SHUFFLE (3, 3, 3, 3));
/* Broadcast each pixel's alpha word across all four of its words,
 * for two 16 bpc registers at once.
 * NOTE(review): the remaining parameters (data_hi and the
 * alpha_lo/alpha_hi out-pointers) and the lo/hi locals are declared
 * on lines missing from this chunk. */
static force_inline void
expand_alpha_2x128 (__m128i data_lo,
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
/* Like expand_alpha_2x128 but broadcasts word 0 instead of word 3
 * (used when the "alpha" lives in the blue slot, e.g. a8 sources).
 * NOTE(review): the remaining parameters and the lo/hi locals are
 * declared on lines missing from this chunk. */
static force_inline void
expand_alpha_rev_2x128 (__m128i data_lo,
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
/* Per-channel multiply with exact rounded divide by 255:
 * (a * b + 0x80) * 0x0101 >> 16 == (a * b + 127) / 255,
 * for two 16 bpc registers at once.
 * NOTE(review): the remaining parameters (data_hi, alpha_lo/hi,
 * ret_lo/hi) and the lo/hi locals are on lines missing from this
 * chunk. */
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
/* ret = src * alpha_dst + dst * alpha_src (saturating byte add) —
 * the shared core of the ATOP and XOR combiners.
 * NOTE(review): several parameter lines (src_hi, dst_lo/hi,
 * ret_lo/hi) fall on lines missing from this chunk. */
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
/* Per-channel 255 - x for two 16 bpc registers (channels are 0x00XX
 * words, so XOR with 0x00ff is the complement).
 * NOTE(review): the data_hi and neg_lo/neg_hi parameter lines are
 * missing from this chunk. */
static force_inline void
negate_2x128 (__m128i data_lo,
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
/* Swap the red and blue words of each pixel (alpha and green stay
 * put), for two 16 bpc registers at once.
 * NOTE(review): the remaining parameters and the lo/hi locals are
 * on lines missing from this chunk. */
static force_inline void
invert_colors_2x128 (__m128i data_lo,
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
/* In-place Porter-Duff OVER for two 16 bpc registers:
 * dst = src + dst * (255 - alpha).
 * NOTE(review): the remaining parameters and the t1/t2 locals are
 * on lines missing from this chunk. */
static force_inline void
over_2x128 (__m128i* src_lo,
    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
/* OVER for non-premultiplied source with reversed colour order:
 * swap R/B, OR in mask_alpha (presumably forcing the alpha word to
 * full — confirm against the mask's initialiser), premultiply, then
 * OVER into dst.
 * NOTE(review): the remaining parameters and the lo/hi locals are
 * on lines missing from this chunk. */
static force_inline void
over_rev_non_pre_2x128 (__m128i src_lo,
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
/* dst = (src IN mask) OVER dst, two 16 bpc registers at once:
 * both the source and its alpha are multiplied by the (component)
 * mask before the OVER.
 * NOTE(review): the remaining parameters and the s_lo/s_hi/a_lo/a_hi
 * locals are on lines missing from this chunk. */
static force_inline void
in_over_2x128 (__m128i* src_lo,
    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
360 /* load 4 pixels from a 16-byte boundary aligned address */
361 static force_inline __m128i
362 load_128_aligned (__m128i* src)
364 return _mm_load_si128 (src);
367 /* load 4 pixels from a unaligned address */
368 static force_inline __m128i
369 load_128_unaligned (const __m128i* src)
371 return _mm_loadu_si128 (src);
/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 * (non-temporal store — bypasses the cache).
 */
static force_inline void
save_128_write_combining (__m128i* dst,
    /* NOTE(review): the 'data' parameter line is missing from this
     * chunk. */
    _mm_stream_si128 (dst, data);
/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
    /* NOTE(review): the 'data' parameter line is missing from this
     * chunk. */
    _mm_store_si128 (dst, data);
/* save 4 pixels on a unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
    /* NOTE(review): the 'data' parameter line is missing from this
     * chunk. */
    _mm_storeu_si128 (dst, data);
400 /* ------------------------------------------------------------------
404 static force_inline __m64
405 load_32_1x64 (uint32_t data)
407 return _mm_cvtsi32_si64 (data);
410 static force_inline __m64
411 unpack_32_1x64 (uint32_t data)
413 return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
416 static force_inline __m64
417 expand_alpha_1x64 (__m64 data)
419 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
422 static force_inline __m64
423 expand_alpha_rev_1x64 (__m64 data)
425 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
428 static force_inline __m64
429 expand_pixel_8_1x64 (uint8_t data)
431 return _mm_shuffle_pi16 (
432 unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
/* Per-channel multiply with rounded divide by 255 — the MMX
 * counterpart of pix_multiply_2x128.
 * NOTE(review): the 'alpha' parameter line and the remaining
 * operands of the expression (presumably mask_x0080 / mask_x0101)
 * are on lines missing from this chunk. */
static force_inline __m64
pix_multiply_1x64 (__m64 data,
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
444 static force_inline __m64
445 pix_add_multiply_1x64 (__m64* src,
450 __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
451 __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
453 return _mm_adds_pu8 (t1, t2);
456 static force_inline __m64
457 negate_1x64 (__m64 data)
459 return _mm_xor_si64 (data, mask_x00ff);
462 static force_inline __m64
463 invert_colors_1x64 (__m64 data)
465 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
468 static force_inline __m64
469 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
471 return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
/* (src IN mask) OVER dst for one pixel: both src and its alpha are
 * multiplied by the mask before the OVER.
 * NOTE(review): the final argument of the over_1x64 call (presumably
 * *dst) is on a line missing from this chunk. */
static force_inline __m64
in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
    return over_1x64 (pix_multiply_1x64 (*src, *mask),
                      pix_multiply_1x64 (*alpha, *mask),
/* Single-pixel OVER for non-premultiplied, reversed-channel source:
 * swap R/B, OR mask_x_alpha into the alpha word, premultiply, then
 * OVER onto dst.
 * NOTE(review): the trailing arguments of the over_1x64 call
 * (presumably alpha and dst) are on lines missing from this chunk. */
static force_inline __m64
over_rev_non_pre_1x64 (__m64 src, __m64 dst)
    __m64 alpha = expand_alpha_1x64 (src);

    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
                                         _mm_or_si64 (alpha, mask_x_alpha)),
493 static force_inline uint32_t
494 pack_1x64_32 (__m64 data)
496 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
499 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
503 * --- Expanding 565 in the low word ---
505 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & 0x01f0003f001f;
 * m = m * 0x008404100840;
510 * Note the trick here - the top word is shifted by another nibble to
511 * avoid it bumping into the middle word
/* Expand one r5g6b5 pixel to 16 bits per channel using the
 * shift/mask/multiply trick described in the comment above.
 * NOTE(review): the local declarations (p, t1, t2 — presumably
 * __m64) are on lines missing from this chunk. */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    /* Red is shifted by an extra nibble so it clears the green word. */
    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, mask_x565_rgb);
    p = _mm_mullo_pi16 (p, mask_x565_unpack);

    return _mm_srli_pi16 (p, 8);
532 /* ----------------------------------------------------------------------------
533 * Compose Core transformations
/* OVER for one pixel: dst = src + dst * (255 - src.alpha).
 * NOTE(review): several lines are missing from this chunk,
 * including the declaration of 'ms' and presumably the
 * fully-opaque / fully-transparent early-out branches. */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
    ms = unpack_32_1x64 (src);
    return pack_1x64_32 (
	over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
/* Fetch one source pixel, applying the mask's alpha to it:
 * s = *ps * (*pm).alpha.
 * NOTE(review): the declarations of s/ms/mm and the surrounding
 * branch (presumably "if (pm)", returning *ps unchanged when the
 * mask is NULL) are on lines missing from this chunk. */
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
    mm = unpack_32_1x64 (*pm);
    mm = expand_alpha_1x64 (mm);

    ms = unpack_32_1x64 (s);
    ms = pix_multiply_1x64 (ms, mm);

    s = pack_1x64_32 (ms);
/* Four-pixel combine1: load 4 source pixels and multiply them by
 * the mask's expanded alpha.  A fully transparent mask short-
 * circuits to zero.
 * NOTE(review): the declaration of 's', the "pm != NULL" branch
 * structure and the final 'return s;' are on lines missing from
 * this chunk. */
static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;

    xmm_msk_lo = load_128_unaligned (pm);

    if (is_transparent (xmm_msk_lo))
	return _mm_setzero_si128 ();

    s = load_128_unaligned (ps);

    unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
    unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

    expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_msk_lo, &xmm_msk_hi,
                        &xmm_src_lo, &xmm_src_hi);

    s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
/* OVER combiner for a scanline: scalar loop until pd is 16-byte
 * aligned, then 4 pixels at a time with SSE2 (opaque source blocks
 * are stored directly, fully transparent ones skipped), then a
 * scalar tail.
 * NOTE(review): the loop headers, width decrements and pointer
 * increments fall on lines missing from this chunk.
 * NOTE(review): '(unsigned long)pd' truncates pointers on LLP64
 * targets (64-bit Windows); uintptr_t would be the portable cast. */
static force_inline void
core_combine_over_u_sse2 (uint32_t* pd,
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);

	/* I'm loading unaligned because I'm not sure about
	 * the address alignment.
	 */
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);

	if (is_opaque (xmm_src_hi))
	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
	else if (!is_zero (xmm_src_hi))
	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	    /* NOTE(review): continuation of an expand_alpha_2x128 call
	     * whose opening line is missing from this chunk. */
	    xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
	                &xmm_alpha_lo, &xmm_alpha_hi,
	                &xmm_dst_lo, &xmm_dst_hi);

	    /* rebuild the 4 pixel data and save */
	    save_128_aligned ((__m128i*)pd,
	                      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
/* OVER-REVERSE combiner: dst = dst + src * (255 - dst.alpha) —
 * note the swapped (d, s) argument order in the scalar calls and
 * the swapped roles in over_2x128.  Same head / 4-wide / tail
 * structure as core_combine_over_u_sse2.
 * NOTE(review): loop headers, counters and pointer increments are
 * on lines missing from this chunk; see also the LLP64 note on the
 * '(unsigned long)pd' cast in core_combine_over_u_sse2. */
static force_inline void
core_combine_over_reverse_u_sse2 (uint32_t* pd,
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    /* NOTE(review): continuation of a 'while (w &&' header whose
     * opening line is missing from this chunk. */
    ((unsigned long)pd & 15))
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (d, s);

	/* I'm loading unaligned because I'm not sure
	 * about the address alignment.
	 */
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
	                    &xmm_alpha_lo, &xmm_alpha_hi);

	/* dst OVER src: src and dst swap roles here */
	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
	            &xmm_alpha_lo, &xmm_alpha_hi,
	            &xmm_src_lo, &xmm_src_hi);

	/* rebuild the 4 pixel data and save */
	save_128_aligned ((__m128i*)pd,
	                  pack_2x128_128 (xmm_src_lo, xmm_src_hi));

	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
/* IN for one pixel: dst * src.alpha.
 * NOTE(review): the maska == 0 branch (presumably 'return 0;') and
 * the opaque fall-through ('return dst;') are on lines missing from
 * this chunk.  Also note the name spelling ("pixelsse2" rather than
 * "pixel_sse2") differs from the sibling helpers; callers below
 * depend on it, so it cannot be renamed in isolation. */
static force_inline uint32_t
core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
    uint32_t maska = src >> 24;

    else if (maska != 0xff)
	return pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (dst),
	                       expand_alpha_1x64 (unpack_32_1x64 (src))));
/* IN combiner: dst = src * dst.alpha (note the (d, s) argument
 * order into core_combine_in_u_pixelsse2, whose parameters are
 * (src, dst)).  Scalar head to align pd, 4-wide SSE2 loop, scalar
 * tail.
 * NOTE(review): loop headers and counters are on lines missing from
 * this chunk; '(unsigned long) pd' truncates on LLP64 targets. */
static force_inline void
core_combine_in_u_sse2 (uint32_t* pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
	s = combine1 (ps, pm);

	*pd++ = core_combine_in_u_pixelsse2 (d, s);

	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
	                    &xmm_dst_lo, &xmm_dst_hi,
	                    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned ((__m128i*)pd,
	                  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	s = combine1 (ps, pm);

	*pd++ = core_combine_in_u_pixelsse2 (d, s);
/* IN-REVERSE combiner: dst = dst * src.alpha (the (s, d) argument
 * order into core_combine_in_u_pixelsse2 selects the reversed
 * operand roles).  Same head / 4-wide / tail structure as the
 * other combiners.
 * NOTE(review): loop headers and counters are on lines missing from
 * this chunk. */
static force_inline void
core_combine_reverse_in_u_sse2 (uint32_t* pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
	s = combine1 (ps, pm);

	*pd++ = core_combine_in_u_pixelsse2 (s, d);

	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
	                    &xmm_src_lo, &xmm_src_hi,
	                    &xmm_dst_lo, &xmm_dst_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	s = combine1 (ps, pm);

	*pd++ = core_combine_in_u_pixelsse2 (s, d);
/* OUT-REVERSE combiner: dst = dst * (255 - src.alpha).
 * NOTE(review): loop headers/counters and the inner
 * 'pix_multiply_1x64 (' lines of the scalar expressions are on
 * lines missing from this chunk. */
static force_inline void
core_combine_reverse_out_u_sse2 (uint32_t* pd,
    while (w && ((unsigned long) pd & 15))
	uint32_t s = combine1 (ps, pm);

	*pd++ = pack_1x64_32 (
	    unpack_32_1x64 (d), negate_1x64 (
	    expand_alpha_1x64 (unpack_32_1x64 (s)))));

	__m128i xmm_src_lo, xmm_src_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
	                    &xmm_src_lo, &xmm_src_hi,
	                    &xmm_dst_lo, &xmm_dst_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	uint32_t s = combine1 (ps, pm);

	*pd++ = pack_1x64_32 (
	    unpack_32_1x64 (d), negate_1x64 (
	    expand_alpha_1x64 (unpack_32_1x64 (s)))));
/* OUT combiner: dst = src * (255 - dst.alpha).
 * NOTE(review): loop headers/counters and the inner
 * 'pix_multiply_1x64 (' lines of the scalar expressions are on
 * lines missing from this chunk. */
static force_inline void
core_combine_out_u_sse2 (uint32_t* pd,
    while (w && ((unsigned long) pd & 15))
	uint32_t s = combine1 (ps, pm);

	*pd++ = pack_1x64_32 (
	    unpack_32_1x64 (s), negate_1x64 (
	    expand_alpha_1x64 (unpack_32_1x64 (d)))));

	__m128i xmm_src_lo, xmm_src_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
	                    &xmm_dst_lo, &xmm_dst_hi,
	                    &xmm_dst_lo, &xmm_dst_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	uint32_t s = combine1 (ps, pm);

	*pd++ = pack_1x64_32 (
	    unpack_32_1x64 (s), negate_1x64 (
	    expand_alpha_1x64 (unpack_32_1x64 (d)))));
/* ATOP for one pixel: src * dst.alpha + dst * (255 - src.alpha).
 * NOTE(review): the 'uint32_t dst' parameter line is missing from
 * this chunk. */
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
    __m64 da = expand_alpha_1x64 (d);

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
/* ATOP combiner: dst = src * dst.alpha + dst * (255 - src.alpha).
 * Scalar head to align pd, 4-wide SSE2 loop, scalar tail.
 * NOTE(review): loop headers and counters are on lines missing from
 * this chunk; '(unsigned long) pd' truncates on LLP64 targets. */
static force_inline void
core_combine_atop_u_sse2 (uint32_t* pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
	s = combine1 (ps, pm);

	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);

	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
	                    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
	                    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* ATOP uses (255 - src.alpha) for the dst term */
	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
	              &xmm_alpha_src_lo, &xmm_alpha_src_hi);

	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	s = combine1 (ps, pm);

	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
/* ATOP-REVERSE for one pixel: src * (255 - dst.alpha) + dst * src.alpha.
 * NOTE(review): the 'uint32_t dst' parameter line is missing from
 * this chunk. */
static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
/* ATOP-REVERSE combiner: dst = src * (255 - dst.alpha) + dst * src.alpha.
 * Same structure as core_combine_atop_u_sse2 but the dst alpha is
 * the negated term.
 * NOTE(review): loop headers and counters are on lines missing from
 * this chunk. */
static force_inline void
core_combine_reverse_atop_u_sse2 (uint32_t* pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
	s = combine1 (ps, pm);

	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);

	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
	                    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
	                    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* reverse ATOP negates the dst alpha instead of the src alpha */
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
	              &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	s = combine1 (ps, pm);

	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
/* XOR for one pixel: src * (255 - dst.alpha) + dst * (255 - src.alpha).
 * NOTE(review): the 'uint32_t dst' parameter line is missing from
 * this chunk. */
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
/* XOR combiner: dst = src * (255 - dst.alpha) + dst * (255 - src.alpha).
 * Scalar head to align pd, 4-wide SSE2 loop, scalar tail.
 * NOTE(review): the 'int width' parameter, the pd/w initialisers
 * and the loop headers/counters are on lines missing from this
 * chunk; '(unsigned long) pd' truncates on LLP64 targets. */
static force_inline void
core_combine_xor_u_sse2 (uint32_t* dst,
                         const uint32_t* src,
                         const uint32_t *mask,
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
	s = combine1 (ps, pm);

	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);

	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
	xmm_dst = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
	                    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
	                    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* XOR negates both alphas */
	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
	              &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
	              &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	s = combine1 (ps, pm);

	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
/* ADD combiner: dst = saturating (src + dst), per byte.
 * Scalar head (MMX saturating add) to align pd, 4-wide SSE2 loop,
 * scalar tail.
 * NOTE(review): the 'int width' parameter, pd/w initialisers and
 * the loop headers/counters are on lines missing from this chunk. */
static force_inline void
core_combine_add_u_sse2 (uint32_t* dst,
                         const uint32_t* src,
                         const uint32_t* mask,
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
	s = combine1 (ps, pm);

	*pd++ = _mm_cvtsi64_si32 (
	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));

	s = combine4 ((__m128i*)ps, (__m128i*)pm);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

	s = combine1 (ps, pm);

	*pd++ = _mm_cvtsi64_si32 (
	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
/* SATURATE for one pixel: when src.alpha exceeds the room left in
 * the destination (da = 255 - dst.alpha), scale src by
 * DIV_UN8 (da, sa) so the result alpha saturates instead of
 * wrapping, then add.
 * NOTE(review): the 'uint32_t dst' parameter line and the guard
 * around the rescale (presumably 'if (sa > da)') are on lines
 * missing from this chunk. */
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;   /* room left in dst's alpha */

    ms = pix_multiply_1x64 (
	ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
/* SATURATE combiner.  The 4-wide loop compares each src alpha with
 * the corresponding ~dst alpha; when any pixel would overflow, all
 * four pixels fall back to the scalar per-pixel routine, otherwise
 * a plain saturating add is stored.
 * NOTE(review): loop headers/counters, the pack_cmp branch
 * structure and the '_mm_cmpgt_epi32 (' line are on lines missing
 * from this chunk. */
static force_inline void
core_combine_saturate_u_sse2 (uint32_t * pd,
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
	s = combine1 (ps, pm);

	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);

	xmm_dst = load_128_aligned ((__m128i*)pd);
	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

	pack_cmp = _mm_movemask_epi8 (
	    _mm_srli_epi32 (xmm_src, 24),
	    _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

	/* if some alpha src is greater than respective ~alpha dst */
	    s = combine1 (ps++, pm);

	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);

	    s = combine1 (ps++, pm);

	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);

	    s = combine1 (ps++, pm);

	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);

	    s = combine1 (ps++, pm);

	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);

	save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

	s = combine1 (ps, pm);

	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
/* Component-alpha SRC combiner: dst = src * mask (per channel).
 * Scalar head to align pd, 4-wide SSE2 loop, scalar tail.
 * NOTE(review): the remaining parameters (ps/pm/w), the s/m/d
 * fetches and the loop headers/counters are on lines missing from
 * this chunk. */
static force_inline void
core_combine_src_ca_sse2 (uint32_t* pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (unsigned long)pd & 15)
	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));

	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
	                    &xmm_mask_lo, &xmm_mask_hi,
	                    &xmm_dst_lo, &xmm_dst_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
/* Component-alpha OVER for one pixel: (src IN mask) OVER dst.
 * NOTE(review): the remaining parameter lines (presumably
 * 'uint32_t mask, uint32_t dst') are missing from this chunk. */
static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expand_alpha_1x64 (s);
    __m64 unpk_mask = unpack_32_1x64 (mask);
    __m64 unpk_dst = unpack_32_1x64 (dst);

    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
/* Component-alpha OVER combiner: dst = (src IN mask) OVER dst.
 * Scalar head to align pd, 4-wide SSE2 loop via in_over_2x128,
 * scalar tail.
 * NOTE(review): the remaining parameters, the s/m/d fetches and the
 * loop headers/counters are on lines missing from this chunk. */
static force_inline void
core_combine_over_ca_sse2 (uint32_t* pd,
    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
	                    &xmm_alpha_lo, &xmm_alpha_hi);

	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
	               &xmm_alpha_lo, &xmm_alpha_hi,
	               &xmm_mask_lo, &xmm_mask_hi,
	               &xmm_dst_lo, &xmm_dst_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
/* Component-alpha OVER-REVERSE for one pixel:
 * dst OVER (src * mask), i.e. dst + (src * mask) * (255 - dst.alpha).
 * NOTE(review): the remaining parameter lines (presumably
 * 'uint32_t mask, uint32_t dst') are missing from this chunk. */
static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (
	over_1x64 (d, expand_alpha_1x64 (d),
	           pix_multiply_1x64 (unpack_32_1x64 (src),
	                              unpack_32_1x64 (mask))));
/* Component-alpha OVER-REVERSE combiner: dst = dst OVER (src * mask).
 * Scalar head to align pd, 4-wide SSE2 loop (mask registers reused
 * for the premultiplied src and the result), scalar tail.
 * NOTE(review): the remaining parameters, the s/m/d fetches and the
 * loop headers/counters are on lines missing from this chunk. */
static force_inline void
core_combine_over_reverse_ca_sse2 (uint32_t* pd,
    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
	                    &xmm_alpha_lo, &xmm_alpha_hi);
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
	                    &xmm_mask_lo, &xmm_mask_hi,
	                    &xmm_mask_lo, &xmm_mask_hi);

	/* dst OVER premultiplied src (now held in the mask regs) */
	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
	            &xmm_alpha_lo, &xmm_alpha_hi,
	            &xmm_mask_lo, &xmm_mask_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
/* Component-alpha IN combiner: dst = (src * mask) * dst.alpha.
 * Scalar head to align pd, 4-wide SSE2 loop, scalar tail.
 * NOTE(review): the remaining parameters, the s/m/d fetches, loop
 * headers/counters and some inner 'pix_multiply_1x64 (' lines of
 * the scalar expressions are on lines missing from this chunk. */
static force_inline void
core_combine_in_ca_sse2 (uint32_t * pd,
    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
	    expand_alpha_1x64 (unpack_32_1x64 (d))));

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
	                    &xmm_alpha_lo, &xmm_alpha_hi);

	/* src * mask ... */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
	                    &xmm_mask_lo, &xmm_mask_hi,
	                    &xmm_dst_lo, &xmm_dst_hi);

	/* ... then * dst.alpha */
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
	                    &xmm_alpha_lo, &xmm_alpha_hi,
	                    &xmm_dst_lo, &xmm_dst_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	*pd++ = pack_1x64_32 (
	    unpack_32_1x64 (s), unpack_32_1x64 (m)),
	    expand_alpha_1x64 (unpack_32_1x64 (d))));
/* Component-alpha IN-REVERSE combiner: dst = dst * (mask * src.alpha).
 * Scalar head to align pd, 4-wide SSE2 loop, scalar tail.
 * NOTE(review): the remaining parameters, the s/m/d fetches, loop
 * headers/counters and the outer 'pix_multiply_1x64 (' lines of the
 * scalar expressions are on lines missing from this chunk. */
static force_inline void
core_combine_in_reverse_ca_sse2 (uint32_t * pd,
    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (m),
	                       expand_alpha_1x64 (unpack_32_1x64 (s)))));

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	/* mask * src.alpha ... */
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
	                    &xmm_alpha_lo, &xmm_alpha_hi);
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
	                    &xmm_alpha_lo, &xmm_alpha_hi,
	                    &xmm_alpha_lo, &xmm_alpha_hi);

	/* ... then dst * that */
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
	                    &xmm_alpha_lo, &xmm_alpha_hi,
	                    &xmm_dst_lo, &xmm_dst_hi);

	/* NOTE(review): continuation of a save_128_aligned call whose
	 * opening line is missing from this chunk. */
	(__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (m),
	                       expand_alpha_1x64 (unpack_32_1x64 (s)))));
/* Component-alpha OUT combiner:
 * per channel, dest = (src * mask) * (1 - alpha(dest)).
 */
1781 static force_inline void
1782 core_combine_out_ca_sse2 (uint32_t * pd,
1789 __m128i xmm_alpha_lo, xmm_alpha_hi;
1790 __m128i xmm_src_lo, xmm_src_hi;
1791 __m128i xmm_dst_lo, xmm_dst_hi;
1792 __m128i xmm_mask_lo, xmm_mask_hi;
/* Head: single pixels until dest is 16-byte aligned. */
1794 while (w && (unsigned long)pd & 15)
1800 *pd++ = pack_1x64_32 (
1803 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1804 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
/* Main loop: 4 pixels per iteration. */
1810 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1811 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1812 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1814 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1815 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1816 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
/* alpha = ~alpha(dest), i.e. (255 - alpha) per channel. */
1818 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1819 &xmm_alpha_lo, &xmm_alpha_hi);
1820 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1821 &xmm_alpha_lo, &xmm_alpha_hi);
/* dst = src * mask, then dst *= negated dest alpha. */
1823 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1824 &xmm_mask_lo, &xmm_mask_hi,
1825 &xmm_dst_lo, &xmm_dst_hi);
1826 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1827 &xmm_alpha_lo, &xmm_alpha_hi,
1828 &xmm_dst_lo, &xmm_dst_hi);
1831 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels. */
1845 *pd++ = pack_1x64_32 (
1848 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1849 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
/* Component-alpha OUT-reverse combiner:
 * per channel, dest = dest * (1 - mask * alpha(src)).
 */
1855 static force_inline void
1856 core_combine_out_reverse_ca_sse2 (uint32_t * pd,
1863 __m128i xmm_alpha_lo, xmm_alpha_hi;
1864 __m128i xmm_src_lo, xmm_src_hi;
1865 __m128i xmm_dst_lo, xmm_dst_hi;
1866 __m128i xmm_mask_lo, xmm_mask_hi;
/* Head: single pixels until dest is 16-byte aligned. */
1868 while (w && (unsigned long)pd & 15)
1874 *pd++ = pack_1x64_32 (
1877 negate_1x64 (pix_multiply_1x64 (
1879 expand_alpha_1x64 (unpack_32_1x64 (s))))));
/* Main loop: 4 pixels per iteration. */
1885 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1886 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1887 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1889 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1890 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1891 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
/* mask = ~(mask * alpha(src)), then dest *= mask. */
1893 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1894 &xmm_alpha_lo, &xmm_alpha_hi);
1896 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1897 &xmm_alpha_lo, &xmm_alpha_hi,
1898 &xmm_mask_lo, &xmm_mask_hi);
1900 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1901 &xmm_mask_lo, &xmm_mask_hi);
1903 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1904 &xmm_mask_lo, &xmm_mask_hi,
1905 &xmm_dst_lo, &xmm_dst_hi);
1908 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels. */
1922 *pd++ = pack_1x64_32 (
1925 negate_1x64 (pix_multiply_1x64 (
1927 expand_alpha_1x64 (unpack_32_1x64 (s))))));
/* Single-pixel component-alpha ATOP:
 * dest = (src*mask) * alpha(dest) + dest * (1 - mask*alpha(src)),
 * computed with pix_add_multiply_1x64 (d*~(m*sa) + s*m*da).
 */
1932 static force_inline uint32_t
1933 core_combine_atop_ca_pixel_sse2 (uint32_t src,
1937 __m64 m = unpack_32_1x64 (mask);
1938 __m64 s = unpack_32_1x64 (src);
1939 __m64 d = unpack_32_1x64 (dst);
1940 __m64 sa = expand_alpha_1x64 (s);
1941 __m64 da = expand_alpha_1x64 (d);
/* s becomes the masked source, m becomes the negated source-alpha factor. */
1943 s = pix_multiply_1x64 (s, m);
1944 m = negate_1x64 (pix_multiply_1x64 (m, sa));
1946 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
/* Component-alpha ATOP combiner (4-pixel SSE2 main loop).
 * Scalar head/tail delegate to core_combine_atop_ca_pixel_sse2.
 */
1949 static force_inline void
1950 core_combine_atop_ca_sse2 (uint32_t * pd,
1957 __m128i xmm_src_lo, xmm_src_hi;
1958 __m128i xmm_dst_lo, xmm_dst_hi;
1959 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1960 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1961 __m128i xmm_mask_lo, xmm_mask_hi;
/* Head: single pixels until dest is 16-byte aligned. */
1963 while (w && (unsigned long)pd & 15)
1969 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
/* Main loop: 4 pixels per iteration. */
1975 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1976 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1977 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1979 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1980 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1981 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1983 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1984 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1985 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1986 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
/* src = src * mask; mask = ~(mask * alpha(src)), matching the
 * single-pixel helper above.
 */
1988 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1989 &xmm_mask_lo, &xmm_mask_hi,
1990 &xmm_src_lo, &xmm_src_hi);
1991 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1992 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1993 &xmm_mask_lo, &xmm_mask_hi);
1995 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
/* dest = dest*mask + src*alpha(dest). */
1997 pix_add_multiply_2x128 (
1998 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
1999 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2000 &xmm_dst_lo, &xmm_dst_hi);
2003 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels. */
2017 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
/* Single-pixel component-alpha ATOP-reverse:
 * dest = d * (m*alpha(s)) + (s*m) * (1 - alpha(d)).
 * Note: unlike ATOP, here the *dest* alpha is negated and the
 * mask*src-alpha factor is not.
 */
2022 static force_inline uint32_t
2023 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2027 __m64 m = unpack_32_1x64 (mask);
2028 __m64 s = unpack_32_1x64 (src);
2029 __m64 d = unpack_32_1x64 (dst);
2031 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2032 __m64 sa = expand_alpha_1x64 (s);
2034 s = pix_multiply_1x64 (s, m);
2035 m = pix_multiply_1x64 (m, sa);
2037 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
/* Component-alpha ATOP-reverse combiner (4-pixel SSE2 main loop).
 * Scalar head/tail delegate to core_combine_reverse_atop_ca_pixel_sse2.
 */
2040 static force_inline void
2041 core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
2048 __m128i xmm_src_lo, xmm_src_hi;
2049 __m128i xmm_dst_lo, xmm_dst_hi;
2050 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2051 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2052 __m128i xmm_mask_lo, xmm_mask_hi;
/* Head: single pixels until dest is 16-byte aligned. */
2054 while (w && (unsigned long)pd & 15)
2060 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
/* Main loop: 4 pixels per iteration. */
2066 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2067 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2068 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2071 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2072 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2075 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2076 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2077 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
/* src = src * mask; mask = mask * alpha(src). */
2079 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2080 &xmm_mask_lo, &xmm_mask_hi,
2081 &xmm_src_lo, &xmm_src_hi);
2082 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2083 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2084 &xmm_mask_lo, &xmm_mask_hi);
/* Negate dest alpha (the "reverse" part of ATOP-reverse). */
2086 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2087 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2089 pix_add_multiply_2x128 (
2090 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092 &xmm_dst_lo, &xmm_dst_hi);
2095 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels. */
2109 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
/* Single-pixel component-alpha XOR:
 * dest = d * (1 - m*alpha(s)) + (s*m) * (1 - alpha(d)) —
 * both alpha factors are negated, unlike ATOP/ATOP-reverse.
 */
2114 static force_inline uint32_t
2115 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2119 __m64 a = unpack_32_1x64 (mask);
2120 __m64 s = unpack_32_1x64 (src);
2121 __m64 d = unpack_32_1x64 (dst);
2123 __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2124 a, expand_alpha_1x64 (s)));
2125 __m64 dest = pix_multiply_1x64 (s, a);
2126 __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2128 return pack_1x64_32 (pix_add_multiply_1x64 (&d,
/* Component-alpha XOR combiner (4-pixel SSE2 main loop).
 * Scalar head/tail delegate to core_combine_xor_ca_pixel_sse2.
 */
2134 static force_inline void
2135 core_combine_xor_ca_sse2 (uint32_t * pd,
2142 __m128i xmm_src_lo, xmm_src_hi;
2143 __m128i xmm_dst_lo, xmm_dst_hi;
2144 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2145 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2146 __m128i xmm_mask_lo, xmm_mask_hi;
/* Head: single pixels until dest is 16-byte aligned. */
2148 while (w && (unsigned long)pd & 15)
2154 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
/* Main loop: 4 pixels per iteration. */
2160 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2164 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2168 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2170 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2171 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
/* src = src * mask; mask = mask * alpha(src). */
2173 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2174 &xmm_mask_lo, &xmm_mask_hi,
2175 &xmm_src_lo, &xmm_src_hi);
2176 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2177 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2178 &xmm_mask_lo, &xmm_mask_hi);
/* XOR negates BOTH factors: dest alpha and mask*src-alpha. */
2180 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2181 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2183 &xmm_mask_lo, &xmm_mask_hi);
2185 pix_add_multiply_2x128 (
2186 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2187 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2188 &xmm_dst_lo, &xmm_dst_hi);
2191 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels. */
2205 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
/* Component-alpha ADD combiner:
 * per channel, dest = saturate(src * mask + dest), using the
 * unsigned saturating byte add (_mm_adds_epu8 / _mm_adds_pu8).
 */
2210 static force_inline void
2211 core_combine_add_ca_sse2 (uint32_t * pd,
2218 __m128i xmm_src_lo, xmm_src_hi;
2219 __m128i xmm_dst_lo, xmm_dst_hi;
2220 __m128i xmm_mask_lo, xmm_mask_hi;
/* Head: single pixels until dest is 16-byte aligned. */
2222 while (w && (unsigned long)pd & 15)
2228 *pd++ = pack_1x64_32 (
2229 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2230 unpack_32_1x64 (m)),
2231 unpack_32_1x64 (d)));
/* Main loop: 4 pixels per iteration. */
2237 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2238 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2239 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2241 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2242 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2243 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2245 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2246 &xmm_mask_lo, &xmm_mask_hi,
2247 &xmm_src_lo, &xmm_src_hi);
/* Saturating add works on packed 8-bit data, so add after repacking. */
2250 (__m128i*)pd, pack_2x128_128 (
2251 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2252 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
/* Tail: remaining pixels. */
2266 *pd++ = pack_1x64_32 (
2267 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2268 unpack_32_1x64 (m)),
2269 unpack_32_1x64 (d)));
2274 /* ---------------------------------------------------
2275 * fb_compose_setup_SSE2
/* Broadcast a 16-bit value into all four lanes of an MMX register. */
2277 static force_inline __m64
2278 create_mask_16_64 (uint16_t mask)
2280 return _mm_set1_pi16 (mask);
/* Broadcast a 16-bit value into all eight lanes of an SSE2 register. */
2283 static force_inline __m128i
2284 create_mask_16_128 (uint16_t mask)
2286 return _mm_set1_epi16 (mask);
/* Build an MMX register from two 32-bit values (mask0 in the high half). */
2289 static force_inline __m64
2290 create_mask_2x32_64 (uint32_t mask0,
2293 return _mm_set_pi32 (mask0, mask1);
/* Build {mask0, mask1, mask0, mask1} as a 128-bit vector.  On Sun
 * Studio >= 12 a macro is used instead of the inline function to dodge
 * a code generation bug; both forms produce the identical vector.
 * NOTE(review): the matching #else/#endif lines are not visible in
 * this excerpt.
 */
2296 /* Work around a code generation bug in Sun Studio 12. */
2297 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2298 # define create_mask_2x32_128(mask0, mask1) \
2299 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2301 static force_inline __m128i
2302 create_mask_2x32_128 (uint32_t mask0,
2305 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2309 /* SSE2 code patch for fbcompose.c */
/* Unified-alpha combiner entry points registered with the pixman
 * implementation table.  Each wrapper simply forwards
 * (dst, src, mask, width) to its core_combine_*_u_sse2 worker; the
 * imp/op arguments are unused.  Note the naming flip for the
 * "reverse" variants: sse2_combine_in_reverse_u calls
 * core_combine_reverse_in_u_sse2, and likewise for out/atop.
 */
2312 sse2_combine_over_u (pixman_implementation_t *imp,
2315 const uint32_t * src,
2316 const uint32_t * mask,
2319 core_combine_over_u_sse2 (dst, src, mask, width);
2324 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2327 const uint32_t * src,
2328 const uint32_t * mask,
2331 core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2336 sse2_combine_in_u (pixman_implementation_t *imp,
2339 const uint32_t * src,
2340 const uint32_t * mask,
2343 core_combine_in_u_sse2 (dst, src, mask, width);
2348 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2351 const uint32_t * src,
2352 const uint32_t * mask,
2355 core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2360 sse2_combine_out_u (pixman_implementation_t *imp,
2363 const uint32_t * src,
2364 const uint32_t * mask,
2367 core_combine_out_u_sse2 (dst, src, mask, width);
2372 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2375 const uint32_t * src,
2376 const uint32_t * mask,
2379 core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2384 sse2_combine_atop_u (pixman_implementation_t *imp,
2387 const uint32_t * src,
2388 const uint32_t * mask,
2391 core_combine_atop_u_sse2 (dst, src, mask, width);
2396 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2399 const uint32_t * src,
2400 const uint32_t * mask,
2403 core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2408 sse2_combine_xor_u (pixman_implementation_t *imp,
2411 const uint32_t * src,
2412 const uint32_t * mask,
2415 core_combine_xor_u_sse2 (dst, src, mask, width);
2420 sse2_combine_add_u (pixman_implementation_t *imp,
2423 const uint32_t * src,
2424 const uint32_t * mask,
2427 core_combine_add_u_sse2 (dst, src, mask, width);
2432 sse2_combine_saturate_u (pixman_implementation_t *imp,
2435 const uint32_t * src,
2436 const uint32_t * mask,
2439 core_combine_saturate_u_sse2 (dst, src, mask, width);
/* Component-alpha combiner entry points; same thin-forwarding pattern
 * as the *_u wrappers above, dispatching to the core_combine_*_ca_sse2
 * workers (with the same reverse-naming flip for in/atop reverse).
 */
2444 sse2_combine_src_ca (pixman_implementation_t *imp,
2447 const uint32_t * src,
2448 const uint32_t * mask,
2451 core_combine_src_ca_sse2 (dst, src, mask, width);
2456 sse2_combine_over_ca (pixman_implementation_t *imp,
2459 const uint32_t * src,
2460 const uint32_t * mask,
2463 core_combine_over_ca_sse2 (dst, src, mask, width);
2468 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2471 const uint32_t * src,
2472 const uint32_t * mask,
2475 core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2480 sse2_combine_in_ca (pixman_implementation_t *imp,
2483 const uint32_t * src,
2484 const uint32_t * mask,
2487 core_combine_in_ca_sse2 (dst, src, mask, width);
2492 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2495 const uint32_t * src,
2496 const uint32_t * mask,
2499 core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2504 sse2_combine_out_ca (pixman_implementation_t *imp,
2507 const uint32_t * src,
2508 const uint32_t * mask,
2511 core_combine_out_ca_sse2 (dst, src, mask, width);
2516 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2519 const uint32_t * src,
2520 const uint32_t * mask,
2523 core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2528 sse2_combine_atop_ca (pixman_implementation_t *imp,
2531 const uint32_t * src,
2532 const uint32_t * mask,
2535 core_combine_atop_ca_sse2 (dst, src, mask, width);
2540 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2543 const uint32_t * src,
2544 const uint32_t * mask,
2547 core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2552 sse2_combine_xor_ca (pixman_implementation_t *imp,
2555 const uint32_t * src,
2556 const uint32_t * mask,
2559 core_combine_xor_ca_sse2 (dst, src, mask, width);
2564 sse2_combine_add_ca (pixman_implementation_t *imp,
2567 const uint32_t * src,
2568 const uint32_t * mask,
2571 core_combine_add_ca_sse2 (dst, src, mask, width);
2575 /* -------------------------------------------------------------------
2576 * composite_over_n_8888
/* OVER a solid color onto an a8r8g8b8 destination.
 * The solid source and its expanded alpha are hoisted out of the
 * per-scanline loops; each scanline uses the usual pattern of a
 * scalar head loop to 16-byte alignment, a 4-pixel SSE2 body, and a
 * scalar tail.
 */
2580 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2582 pixman_image_t * src_image,
2583 pixman_image_t * mask_image,
2584 pixman_image_t * dst_image,
2595 uint32_t *dst_line, *dst, d;
2598 __m128i xmm_src, xmm_alpha;
2599 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
/* Resolve the solid source in the destination's format. */
2601 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2606 PIXMAN_IMAGE_GET_LINE (
2607 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2609 xmm_src = expand_pixel_32_1x128 (src);
2610 xmm_alpha = expand_alpha_1x128 (xmm_src);
2616 dst_line += dst_stride;
/* Head: single pixels until dest is 16-byte aligned. */
2619 while (w && (unsigned long)dst & 15)
2622 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2623 _mm_movepi64_pi64 (xmm_alpha),
2624 unpack_32_1x64 (d)));
/* Main loop: 4 destination pixels per iteration. */
2630 xmm_dst = load_128_aligned ((__m128i*)dst);
2632 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2634 over_2x128 (&xmm_src, &xmm_src,
2635 &xmm_alpha, &xmm_alpha,
2636 &xmm_dst_lo, &xmm_dst_hi);
2638 /* rebuild the 4 pixel data and save */
2640 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels. */
2649 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2650 _mm_movepi64_pi64 (xmm_alpha),
2651 unpack_32_1x64 (d)));
2659 /* ---------------------------------------------------------------------
2660 * composite_over_n_0565
/* OVER a solid color onto an r5g6b5 destination.  Each SSE2 iteration
 * handles 8 16-bit pixels: the 128-bit load is unpacked into four
 * 2-pixel 8888 registers, blended with two over_2x128 calls, and
 * repacked to 565 for the aligned store.
 */
2663 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2665 pixman_image_t * src_image,
2666 pixman_image_t * mask_image,
2667 pixman_image_t * dst_image,
2678 uint16_t *dst_line, *dst, d;
2681 __m128i xmm_src, xmm_alpha;
2682 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2684 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2689 PIXMAN_IMAGE_GET_LINE (
2690 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2692 xmm_src = expand_pixel_32_1x128 (src);
2693 xmm_alpha = expand_alpha_1x128 (xmm_src);
2699 dst_line += dst_stride;
/* Head: single 565 pixels until dest is 16-byte aligned. */
2702 while (w && (unsigned long)dst & 15)
2706 *dst++ = pack_565_32_16 (
2707 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2708 _mm_movepi64_pi64 (xmm_alpha),
2709 expand565_16_1x64 (d))));
/* Main loop: 8 pixels per iteration. */
2715 xmm_dst = load_128_aligned ((__m128i*)dst);
2717 unpack_565_128_4x128 (xmm_dst,
2718 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2720 over_2x128 (&xmm_src, &xmm_src,
2721 &xmm_alpha, &xmm_alpha,
2722 &xmm_dst0, &xmm_dst1);
2723 over_2x128 (&xmm_src, &xmm_src,
2724 &xmm_alpha, &xmm_alpha,
2725 &xmm_dst2, &xmm_dst3);
2727 xmm_dst = pack_565_4x128_128 (
2728 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2730 save_128_aligned ((__m128i*)dst, xmm_dst);
/* Tail: remaining pixels. */
2739 *dst++ = pack_565_32_16 (
2740 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2741 _mm_movepi64_pi64 (xmm_alpha),
2742 expand565_16_1x64 (d))));
2749 /* ------------------------------
2750 * composite_add_n_8888_8888_ca
/* ADD of a solid color through an a8r8g8b8 component-alpha mask onto
 * an a8r8g8b8 destination: dest = saturate(mask * src + dest).
 * The 4-pixel loop skips the multiply/add entirely when all four mask
 * pixels are zero (movemask over a compare-to-zero).
 */
2753 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2755 pixman_image_t * src_image,
2756 pixman_image_t * mask_image,
2757 pixman_image_t * dst_image,
2768 uint32_t *dst_line, d;
2769 uint32_t *mask_line, m;
2771 int dst_stride, mask_stride;
2773 __m128i xmm_src, xmm_alpha;
2775 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2777 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2779 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2785 PIXMAN_IMAGE_GET_LINE (
2786 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2787 PIXMAN_IMAGE_GET_LINE (
2788 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
/* Pre-expand the solid source (and its alpha) once, in both the
 * SSE2 and MMX register files.
 */
2790 xmm_src = _mm_unpacklo_epi8 (
2791 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2792 xmm_alpha = expand_alpha_1x128 (xmm_src);
2793 mmx_src = _mm_movepi64_pi64 (xmm_src);
2794 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2799 const uint32_t *pm = (uint32_t *)mask_line;
2800 uint32_t *pd = (uint32_t *)dst_line;
2802 dst_line += dst_stride;
2803 mask_line += mask_stride;
/* Head: single pixels until dest is 16-byte aligned. */
2805 while (w && (unsigned long)pd & 15)
2813 mmx_mask = unpack_32_1x64 (m);
2814 mmx_dest = unpack_32_1x64 (d);
2816 *pd = pack_1x64_32 (
2817 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
/* Main loop: 4 pixels per iteration. */
2826 xmm_mask = load_128_unaligned ((__m128i*)pm);
2830 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2832 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2833 if (pack_cmp != 0xffff)
2835 xmm_dst = load_128_aligned ((__m128i*)pd);
2837 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2839 pix_multiply_2x128 (&xmm_src, &xmm_src,
2840 &xmm_mask_lo, &xmm_mask_hi,
2841 &xmm_mask_lo, &xmm_mask_hi);
2842 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2845 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
/* Tail: remaining pixels. */
2861 mmx_mask = unpack_32_1x64 (m);
2862 mmx_dest = unpack_32_1x64 (d);
2864 *pd = pack_1x64_32 (
2865 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2876 /* ---------------------------------------------------------------------------
2877 * composite_over_n_8888_8888_ca
/* OVER of a solid color through an a8r8g8b8 component-alpha mask onto
 * an a8r8g8b8 destination, via in_over (src IN mask OVER dest).
 * Same zero-mask skip as the ADD variant above: the 4-pixel body is
 * bypassed when the whole mask vector is zero.
 */
2881 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2883 pixman_image_t * src_image,
2884 pixman_image_t * mask_image,
2885 pixman_image_t * dst_image,
2896 uint32_t *dst_line, d;
2897 uint32_t *mask_line, m;
2899 int dst_stride, mask_stride;
2901 __m128i xmm_src, xmm_alpha;
2902 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2903 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2905 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2907 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2912 PIXMAN_IMAGE_GET_LINE (
2913 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2914 PIXMAN_IMAGE_GET_LINE (
2915 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
/* Pre-expand the solid source and its alpha once, in both register files. */
2917 xmm_src = _mm_unpacklo_epi8 (
2918 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2919 xmm_alpha = expand_alpha_1x128 (xmm_src);
2920 mmx_src = _mm_movepi64_pi64 (xmm_src);
2921 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2926 const uint32_t *pm = (uint32_t *)mask_line;
2927 uint32_t *pd = (uint32_t *)dst_line;
2929 dst_line += dst_stride;
2930 mask_line += mask_stride;
/* Head: single pixels until dest is 16-byte aligned. */
2932 while (w && (unsigned long)pd & 15)
2939 mmx_mask = unpack_32_1x64 (m);
2940 mmx_dest = unpack_32_1x64 (d);
2942 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
/* Main loop: 4 pixels per iteration, skipped when the mask is all zero. */
2954 xmm_mask = load_128_unaligned ((__m128i*)pm);
2958 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2960 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2961 if (pack_cmp != 0xffff)
2963 xmm_dst = load_128_aligned ((__m128i*)pd);
2965 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2966 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2968 in_over_2x128 (&xmm_src, &xmm_src,
2969 &xmm_alpha, &xmm_alpha,
2970 &xmm_mask_lo, &xmm_mask_hi,
2971 &xmm_dst_lo, &xmm_dst_hi);
2974 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels. */
2989 mmx_mask = unpack_32_1x64 (m);
2990 mmx_dest = unpack_32_1x64 (d);
2992 *pd = pack_1x64_32 (
2993 in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3004 /*---------------------------------------------------------------------
3005 * composite_over_8888_n_8888
/* OVER an a8r8g8b8 source, attenuated by a solid mask's alpha, onto an
 * a8r8g8b8 destination.  The solid mask's alpha byte is broadcast once
 * (create_mask_16_128 of mask >> 24) and reused for every pixel.
 */
3009 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3011 pixman_image_t * src_image,
3012 pixman_image_t * mask_image,
3013 pixman_image_t * dst_image,
3023 uint32_t *dst_line, *dst;
3024 uint32_t *src_line, *src;
3027 int dst_stride, src_stride;
3030 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3031 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3032 __m128i xmm_alpha_lo, xmm_alpha_hi;
3034 PIXMAN_IMAGE_GET_LINE (
3035 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3036 PIXMAN_IMAGE_GET_LINE (
3037 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
/* Only the mask's alpha channel matters; broadcast it to all lanes. */
3039 mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3041 xmm_mask = create_mask_16_128 (mask >> 24);
3046 dst_line += dst_stride;
3048 src_line += src_stride;
/* Head: single pixels until dest is 16-byte aligned. */
3051 while (w && (unsigned long)dst & 15)
3053 uint32_t s = *src++;
3056 __m64 ms = unpack_32_1x64 (s);
3057 __m64 alpha = expand_alpha_1x64 (ms);
3058 __m64 dest = _mm_movepi64_pi64 (xmm_mask);
3059 __m64 alpha_dst = unpack_32_1x64 (d);
3061 *dst++ = pack_1x64_32 (
3062 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
/* Main loop: 4 pixels per iteration. */
3069 xmm_src = load_128_unaligned ((__m128i*)src);
3070 xmm_dst = load_128_aligned ((__m128i*)dst);
3072 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3073 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3074 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3075 &xmm_alpha_lo, &xmm_alpha_hi);
3077 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3078 &xmm_alpha_lo, &xmm_alpha_hi,
3079 &xmm_mask, &xmm_mask,
3080 &xmm_dst_lo, &xmm_dst_hi);
3083 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels. */
3092 uint32_t s = *src++;
3095 __m64 ms = unpack_32_1x64 (s);
3096 __m64 alpha = expand_alpha_1x64 (ms);
3097 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3098 __m64 dest = unpack_32_1x64 (d);
3100 *dst++ = pack_1x64_32 (
3101 in_over_1x64 (&ms, &alpha, &mask, &dest));
3110 /*---------------------------------------------------------------------
3111 * composite_src_x888_8888
/* SRC copy from x8r8g8b8 to a8r8g8b8: copy each pixel while forcing
 * the (undefined) alpha byte to 0xff.  The main loop moves 16 pixels
 * (64 bytes) per iteration, ORing each vector with mask_ff000000.
 */
3115 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
3117 pixman_image_t * src_image,
3118 pixman_image_t * mask_image,
3119 pixman_image_t * dst_image,
3129 uint32_t *dst_line, *dst;
3130 uint32_t *src_line, *src;
3132 int dst_stride, src_stride;
3135 PIXMAN_IMAGE_GET_LINE (
3136 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3137 PIXMAN_IMAGE_GET_LINE (
3138 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3143 dst_line += dst_stride;
3145 src_line += src_stride;
/* Head: scalar copies until dest is 16-byte aligned. */
3148 while (w && (unsigned long)dst & 15)
3150 *dst++ = *src++ | 0xff000000;
/* Main loop: 16 pixels per iteration, unaligned loads / aligned stores. */
3156 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
3158 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
3159 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
3160 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
3161 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
3163 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
3164 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
3165 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
3166 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
/* Tail: remaining pixels. */
3175 *dst++ = *src++ | 0xff000000;
3183 /* ---------------------------------------------------------------------
3184 * composite_over_x888_n_8888
/* OVER an x8r8g8b8 source (alpha forced to 0xff), attenuated by a
 * solid mask's alpha, onto an a8r8g8b8 destination.  Because the
 * source is opaque, the per-pixel source alpha is the constant
 * mask_00ff rather than an expand_alpha of each pixel.
 */
3187 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3189 pixman_image_t * src_image,
3190 pixman_image_t * mask_image,
3191 pixman_image_t * dst_image,
3201 uint32_t *dst_line, *dst;
3202 uint32_t *src_line, *src;
3204 int dst_stride, src_stride;
3207 __m128i xmm_mask, xmm_alpha;
3208 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3209 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3211 PIXMAN_IMAGE_GET_LINE (
3212 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3213 PIXMAN_IMAGE_GET_LINE (
3214 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3216 mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
/* Broadcast the mask's alpha byte; source alpha is constant 0xff. */
3218 xmm_mask = create_mask_16_128 (mask >> 24);
3219 xmm_alpha = mask_00ff;
3224 dst_line += dst_stride;
3226 src_line += src_stride;
/* Head: single pixels until dest is 16-byte aligned. */
3229 while (w && (unsigned long)dst & 15)
3231 uint32_t s = (*src++) | 0xff000000;
3234 __m64 src = unpack_32_1x64 (s);
3235 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3236 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3237 __m64 dest = unpack_32_1x64 (d);
3239 *dst++ = pack_1x64_32 (
3240 in_over_1x64 (&src, &alpha, &mask, &dest));
/* Main loop: 4 pixels per iteration; the OR forces opaque alpha. */
3247 xmm_src = _mm_or_si128 (
3248 load_128_unaligned ((__m128i*)src), mask_ff000000);
3249 xmm_dst = load_128_aligned ((__m128i*)dst);
3251 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3252 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3254 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3255 &xmm_alpha, &xmm_alpha,
3256 &xmm_mask, &xmm_mask,
3257 &xmm_dst_lo, &xmm_dst_hi);
3260 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels. */
3270 uint32_t s = (*src++) | 0xff000000;
3273 __m64 src = unpack_32_1x64 (s);
3274 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3275 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3276 __m64 dest = unpack_32_1x64 (d);
3278 *dst++ = pack_1x64_32 (
3279 in_over_1x64 (&src, &alpha, &mask, &dest));
3288 /* --------------------------------------------------------------------
3289 * composite_over_8888_8888
/* OVER an a8r8g8b8 source onto an a8r8g8b8 destination with no mask:
 * per scanline, delegate straight to core_combine_over_u_sse2 with a
 * NULL mask pointer.
 */
3292 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3294 pixman_image_t * src_image,
3295 pixman_image_t * mask_image,
3296 pixman_image_t * dst_image,
3306 int dst_stride, src_stride;
3307 uint32_t *dst_line, *dst;
3308 uint32_t *src_line, *src;
3310 PIXMAN_IMAGE_GET_LINE (
3311 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3312 PIXMAN_IMAGE_GET_LINE (
3313 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3320 core_combine_over_u_sse2 (dst, src, NULL, width);
3328 /* ------------------------------------------------------------------
3329 * composite_over_8888_0565
/* OVER one a8r8g8b8 source pixel onto one r5g6b5 destination pixel:
 * expand dst to 8888, blend, then repack to 565.
 */
3331 static force_inline uint16_t
3332 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3336 ms = unpack_32_1x64 (src);
3337 return pack_565_32_16 (
3340 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
/* OVER an a8r8g8b8 source onto an r5g6b5 destination.  The SSE2 body
 * handles 8 destination pixels per iteration, interleaving the load of
 * the second group of 4 source pixels with the blend of the first to
 * hide memory latency.
 */
3344 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3346 pixman_image_t * src_image,
3347 pixman_image_t * mask_image,
3348 pixman_image_t * dst_image,
3358 uint16_t *dst_line, *dst, d;
3359 uint32_t *src_line, *src, s;
3360 int dst_stride, src_stride;
3363 __m128i xmm_alpha_lo, xmm_alpha_hi;
3364 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3365 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3367 PIXMAN_IMAGE_GET_LINE (
3368 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3369 PIXMAN_IMAGE_GET_LINE (
3370 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3375 * I copy the code from MMX one and keep the fixme.
3376 * If it's a problem there, probably is a problem here.
3378 assert (src_image->drawable == mask_image->drawable);
3386 dst_line += dst_stride;
3387 src_line += src_stride;
3390 /* Align dst on a 16-byte boundary */
3392 ((unsigned long)dst & 15))
3397 *dst++ = composite_over_8888_0565pixel (s, d);
3401 /* It's a 8 pixel loop */
3404 /* I'm loading unaligned because I'm not sure
3405 * about the address alignment.
3407 xmm_src = load_128_unaligned ((__m128i*) src);
3408 xmm_dst = load_128_aligned ((__m128i*) dst);
/* Expand the first 4 source pixels and all 8 565 dest pixels. */
3411 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3412 unpack_565_128_4x128 (xmm_dst,
3413 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3414 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3415 &xmm_alpha_lo, &xmm_alpha_hi);
3417 /* I'm loading next 4 pixels from memory
3418 * before to optimize the memory read.
3420 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3422 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3423 &xmm_alpha_lo, &xmm_alpha_hi,
3424 &xmm_dst0, &xmm_dst1);
/* Second group of 4 source pixels. */
3427 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3428 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3429 &xmm_alpha_lo, &xmm_alpha_hi);
3431 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3432 &xmm_alpha_lo, &xmm_alpha_hi,
3433 &xmm_dst2, &xmm_dst3);
3436 (__m128i*)dst, pack_565_4x128_128 (
3437 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
/* Tail: remaining (< 8) pixels, one at a time. */
3449 *dst++ = composite_over_8888_0565pixel (s, d);
3456 /* -----------------------------------------------------------------
3457 * composite_over_n_8_8888
/* OVER a solid color through an a8 mask onto an a8r8g8b8 destination.
 * The main loop reads 4 mask bytes at once as a uint32_t; when the
 * source is opaque (srca == 0xff) and all four mask bytes are 0xff it
 * stores the pre-built solid vector (xmm_def) directly, skipping the
 * blend entirely.
 */
3461 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3463 pixman_image_t * src_image,
3464 pixman_image_t * mask_image,
3465 pixman_image_t * dst_image,
3476 uint32_t *dst_line, *dst;
3477 uint8_t *mask_line, *mask;
3478 int dst_stride, mask_stride;
3482 __m128i xmm_src, xmm_alpha, xmm_def;
3483 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3484 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3486 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3488 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3494 PIXMAN_IMAGE_GET_LINE (
3495 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3496 PIXMAN_IMAGE_GET_LINE (
3497 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
/* xmm_def holds the packed solid color for the opaque fast path. */
3499 xmm_def = create_mask_2x32_128 (src, src);
3500 xmm_src = expand_pixel_32_1x128 (src);
3501 xmm_alpha = expand_alpha_1x128 (xmm_src);
3502 mmx_src = _mm_movepi64_pi64 (xmm_src);
3503 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3508 dst_line += dst_stride;
3510 mask_line += mask_stride;
/* Head: single pixels until dest is 16-byte aligned. */
3513 while (w && (unsigned long)dst & 15)
3515 uint8_t m = *mask++;
3520 mmx_mask = expand_pixel_8_1x64 (m);
3521 mmx_dest = unpack_32_1x64 (d);
3523 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
/* Main loop: 4 pixels per iteration; 4 mask bytes read as one word. */
3535 m = *((uint32_t*)mask);
3537 if (srca == 0xff && m == 0xffffffff)
3539 save_128_aligned ((__m128i*)dst, xmm_def);
3543 xmm_dst = load_128_aligned ((__m128i*) dst);
3544 xmm_mask = unpack_32_1x128 (m);
3545 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3548 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3549 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
/* Replicate each a8 mask byte into all four channels. */
3551 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3552 &xmm_mask_lo, &xmm_mask_hi);
3554 in_over_2x128 (&xmm_src, &xmm_src,
3555 &xmm_alpha, &xmm_alpha,
3556 &xmm_mask_lo, &xmm_mask_hi,
3557 &xmm_dst_lo, &xmm_dst_hi);
3560 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels. */
3570 uint8_t m = *mask++;
3575 mmx_mask = expand_pixel_8_1x64 (m);
3576 mmx_dest = unpack_32_1x64 (d);
3578 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3592 /* ----------------------------------------------------------------
3593 * composite_over_n_8_8888
/* Fill a rectangle of an 8/16/32-bpp buffer with a constant value using
 * SSE2 stores.  The scalar value is replicated to the full 32 bits, then
 * broadcast into xmm_def; each row is written with byte/word/dword head
 * loops up to 16-byte alignment, then 128/64/32/16-byte aligned store
 * bursts, then scalar tails.  NOTE(review): fragmentary listing — the bpp
 * switch/case framing and loop headers are elided.
 */
3597 pixman_fill_sse2 (uint32_t *bits,
3606     uint32_t byte_width;
     /* 8 bpp: stride is in uint32 units; convert to bytes. */
3616 	stride = stride * (int) sizeof (uint32_t) / 1;
3617 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3623 	data = (w << 16) | w;
     /* 16 bpp: replicate the 16-bit value into both halves of data. */
3627 	stride = stride * (int) sizeof (uint32_t) / 2;
3628 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3629 	byte_width = 2 * width;
3632 	data = (data & 0xffff) * 0x00010001;
     /* 32 bpp: use the value as-is. */
3636 	stride = stride * (int) sizeof (uint32_t) / 4;
3637 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3638 	byte_width = 4 * width;
     /* Broadcast the replicated 32-bit value to 128 bits. */
3646     xmm_def = create_mask_2x32_128 (data, data);
3651 	uint8_t *d = byte_line;
3652 	byte_line += stride;
     /* Alignment head: byte, then word, then dword stores. */
3655 	while (w >= 1 && ((unsigned long)d & 1))
3657 	    *(uint8_t *)d = data;
3662 	while (w >= 2 && ((unsigned long)d & 3))
3664 	    *(uint16_t *)d = data;
3669 	while (w >= 4 && ((unsigned long)d & 15))
3671 	    *(uint32_t *)d = data;
     /* 128-byte burst (8 aligned stores per iteration). */
3679 	    save_128_aligned ((__m128i*)(d), xmm_def);
3680 	    save_128_aligned ((__m128i*)(d + 16), xmm_def);
3681 	    save_128_aligned ((__m128i*)(d + 32), xmm_def);
3682 	    save_128_aligned ((__m128i*)(d + 48), xmm_def);
3683 	    save_128_aligned ((__m128i*)(d + 64), xmm_def);
3684 	    save_128_aligned ((__m128i*)(d + 80), xmm_def);
3685 	    save_128_aligned ((__m128i*)(d + 96), xmm_def);
3686 	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
     /* 64-byte burst. */
3694 	    save_128_aligned ((__m128i*)(d), xmm_def);
3695 	    save_128_aligned ((__m128i*)(d + 16), xmm_def);
3696 	    save_128_aligned ((__m128i*)(d + 32), xmm_def);
3697 	    save_128_aligned ((__m128i*)(d + 48), xmm_def);
     /* 32-byte burst. */
3705 	    save_128_aligned ((__m128i*)(d), xmm_def);
3706 	    save_128_aligned ((__m128i*)(d + 16), xmm_def);
     /* Single 16-byte store. */
3714 	    save_128_aligned ((__m128i*)(d), xmm_def);
     /* Scalar tail: dword, word, byte. */
3722 	    *(uint32_t *)d = data;
3730 	    *(uint16_t *)d = data;
3737 	    *(uint8_t *)d = data;
/* SRC-composite a solid color through an a8 mask onto a8r8g8b8.
 * Fast exit: when (presumably) the source is fully opaque/trivial, the
 * whole rectangle is handled by pixman_fill_sse2 (the guarding condition
 * is elided from this listing).  Otherwise: src*mask per pixel, with a
 * 4-pixel SSE2 middle loop and scalar head/tail.
 */
3748 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3750                              pixman_image_t *         src_image,
3751                              pixman_image_t *         mask_image,
3752                              pixman_image_t *         dst_image,
3763     uint32_t *dst_line, *dst;
3764     uint8_t *mask_line, *mask;
3765     int dst_stride, mask_stride;
3769     __m128i xmm_src, xmm_def;
3770     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3772     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
     /* Degenerate case (condition elided): fill with zero. */
3777 	pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3778 	                  PIXMAN_FORMAT_BPP (dst_image->bits.format),
3779 	                  dest_x, dest_y, width, height, 0);
3783     PIXMAN_IMAGE_GET_LINE (
3784 	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3785     PIXMAN_IMAGE_GET_LINE (
3786 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3788     xmm_def = create_mask_2x32_128 (src, src);
3789     xmm_src = expand_pixel_32_1x128 (src);
3794 	dst_line += dst_stride;
3796 	mask_line += mask_stride;
     /* Head loop: align dst to 16 bytes. */
3799 	while (w && (unsigned long)dst & 15)
3801 	    uint8_t m = *mask++;
3805 		*dst = pack_1x64_32 (
3807 			_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
     /* Aligned loop: 4 mask bytes / 4 dest pixels per iteration. */
3820 	    m = *((uint32_t*)mask);
3822 	    if (srca == 0xff && m == 0xffffffff)
3824 		save_128_aligned ((__m128i*)dst, xmm_def);
3828 		xmm_mask = unpack_32_1x128 (m);
3829 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3832 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3834 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3835 					&xmm_mask_lo, &xmm_mask_hi);
         /* SRC operator: destination is replaced by src*mask. */
3837 		pix_multiply_2x128 (&xmm_src, &xmm_src,
3838 				    &xmm_mask_lo, &xmm_mask_hi,
3839 				    &xmm_mask_lo, &xmm_mask_hi);
3842 		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
     /* Zero mask: SRC writes transparent black. */
3846 		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
     /* Tail loop. */
3856 	    uint8_t m = *mask++;
3860 		*dst = pack_1x64_32 (
3862 			_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3877 /*-----------------------------------------------------------------------
3878 * composite_over_n_8_0565
/* OVER-composite a solid source through an a8 mask onto an r5g6b5
 * destination.  Eight 565 pixels per aligned iteration: the 128-bit dest
 * load is unpacked into four 8888 halves (dst0..dst3), composited in two
 * 4-pixel passes, and re-packed to 565.  NOTE(review): fragmentary —
 * loop headers and the `m` nonzero tests are elided.
 */
3882 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3884                               pixman_image_t *         src_image,
3885                               pixman_image_t *         mask_image,
3886                               pixman_image_t *         dst_image,
3897     uint16_t    *dst_line, *dst, d;
3898     uint8_t     *mask_line, *mask;
3899     int dst_stride, mask_stride;
3902     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3904     __m128i xmm_src, xmm_alpha;
3905     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3906     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3908     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3914     PIXMAN_IMAGE_GET_LINE (
3915 	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3916     PIXMAN_IMAGE_GET_LINE (
3917 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
     /* Loop-invariant solid source and its alpha. */
3919     xmm_src = expand_pixel_32_1x128 (src);
3920     xmm_alpha = expand_alpha_1x128 (xmm_src);
3921     mmx_src = _mm_movepi64_pi64 (xmm_src);
3922     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3927 	dst_line += dst_stride;
3929 	mask_line += mask_stride;
     /* Head loop: single 565 pixels until 16-byte alignment. */
3932 	while (w && (unsigned long)dst & 15)
3939 		mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
3940 		mmx_dest = expand565_16_1x64 (d);
3942 		*dst = pack_565_32_16 (
3945 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
     /* Aligned loop: 8 565-pixels per iteration. */
3954 	    xmm_dst = load_128_aligned ((__m128i*) dst);
3955 	    unpack_565_128_4x128 (xmm_dst,
3956 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
         /* First 4 mask bytes → composite into dst0/dst1. */
3958 	    m = *((uint32_t*)mask);
3963 		xmm_mask = unpack_32_1x128 (m);
3964 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3967 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3969 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3970 					&xmm_mask_lo, &xmm_mask_hi);
3972 		in_over_2x128 (&xmm_src, &xmm_src,
3973 			       &xmm_alpha, &xmm_alpha,
3974 			       &xmm_mask_lo, &xmm_mask_hi,
3975 			       &xmm_dst0, &xmm_dst1);
         /* Next 4 mask bytes → composite into dst2/dst3. */
3978 	    m = *((uint32_t*)mask);
3983 		xmm_mask = unpack_32_1x128 (m);
3984 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3987 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3989 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3990 					&xmm_mask_lo, &xmm_mask_hi);
3991 		in_over_2x128 (&xmm_src, &xmm_src,
3992 			       &xmm_alpha, &xmm_alpha,
3993 			       &xmm_mask_lo, &xmm_mask_hi,
3994 			       &xmm_dst2, &xmm_dst3);
     /* Re-pack all four 8888 halves to eight 565 pixels and store. */
3998 		(__m128i*)dst, pack_565_4x128_128 (
3999 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
     /* Tail loop. */
4012 		mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4013 		mmx_dest = expand565_16_1x64 (d);
4015 		*dst = pack_565_32_16 (
4018 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4029 /* -----------------------------------------------------------------------
4030 * composite_over_pixbuf_0565
/* OVER-composite a "pixbuf" source (non-premultiplied, channels to be
 * reverse-ordered — see over_rev_non_pre helpers) onto an r5g6b5
 * destination.  Uses is_opaque/is_zero tests on each 4-pixel source
 * load to skip the full blend when possible, and preloads the next
 * source vector to hide latency.
 */
4034 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4036                                  pixman_image_t *         src_image,
4037                                  pixman_image_t *         mask_image,
4038                                  pixman_image_t *         dst_image,
4048     uint16_t    *dst_line, *dst, d;
4049     uint32_t    *src_line, *src, s;
4050     int dst_stride, src_stride;
4052     uint32_t opaque, zero;
4055     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4056     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4058     PIXMAN_IMAGE_GET_LINE (
4059 	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4060     PIXMAN_IMAGE_GET_LINE (
4061 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4066      * I copy the code from MMX one and keep the fixme.
4067      * If it's a problem there, probably is a problem here.
4069     assert (src_image->drawable == mask_image->drawable);
4075 	dst_line += dst_stride;
4077 	src_line += src_stride;
     /* Head loop: align dst. */
4080 	while (w && (unsigned long)dst & 15)
4085 	    ms = unpack_32_1x64 (s);
4087 	    *dst++ = pack_565_32_16 (
4089 		    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
     /* Aligned loop: 8 dest 565-pixels, two 4-pixel source loads. */
4096 	    xmm_src = load_128_unaligned ((__m128i*)src);
4097 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4099 	    opaque = is_opaque (xmm_src);
4100 	    zero = is_zero (xmm_src);
4102 	    unpack_565_128_4x128 (xmm_dst,
4103 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4104 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4106 	    /* preload next round*/
4107 	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
         /* Fully opaque source: just swap channels, no blend. */
4111 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4112 				     &xmm_dst0, &xmm_dst1);
         /* Mixed alpha: full reverse-non-premultiplied OVER. */
4116 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4117 					&xmm_dst0, &xmm_dst1);
     /* Second half (pixels 4..7). */
4121 	    opaque = is_opaque (xmm_src);
4122 	    zero = is_zero (xmm_src);
4124 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4128 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4129 				     &xmm_dst2, &xmm_dst3);
4133 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4134 					&xmm_dst2, &xmm_dst3);
4138 		(__m128i*)dst, pack_565_4x128_128 (
4139 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
     /* Tail loop. */
4151 	    ms = unpack_32_1x64 (s);
4153 	    *dst++ = pack_565_32_16 (
4155 		    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4163 /* -------------------------------------------------------------------------
4164 * composite_over_pixbuf_8888
/* OVER-composite a "pixbuf" source (non-premultiplied, reverse channel
 * order) onto an a8r8g8b8 destination.  Same is_opaque/is_zero
 * short-circuiting as the 0565 variant, but 4 pixels per iteration with
 * no 565 pack/unpack.
 */
4168 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4170                                  pixman_image_t *         src_image,
4171                                  pixman_image_t *         mask_image,
4172                                  pixman_image_t *         dst_image,
4182     uint32_t    *dst_line, *dst, d;
4183     uint32_t    *src_line, *src, s;
4184     int dst_stride, src_stride;
4186     uint32_t opaque, zero;
4188     __m128i xmm_src_lo, xmm_src_hi;
4189     __m128i xmm_dst_lo, xmm_dst_hi;
4191     PIXMAN_IMAGE_GET_LINE (
4192 	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4193     PIXMAN_IMAGE_GET_LINE (
4194 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4199      * I copy the code from MMX one and keep the fixme.
4200      * If it's a problem there, probably is a problem here.
4202     assert (src_image->drawable == mask_image->drawable);
4208 	dst_line += dst_stride;
4210 	src_line += src_stride;
     /* Head loop: align dst to 16 bytes. */
4213 	while (w && (unsigned long)dst & 15)
4218 	    *dst++ = pack_1x64_32 (
4219 		over_rev_non_pre_1x64 (
4220 		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
     /* Aligned loop: 4 pixels per iteration. */
4227 	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
4229 	    opaque = is_opaque (xmm_src_hi);
4230 	    zero = is_zero (xmm_src_hi);
4232 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
         /* Opaque: only channel reordering is needed. */
4236 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4237 				     &xmm_dst_lo, &xmm_dst_hi);
4240 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
         /* Mixed alpha: load dest and do the full blend. */
4244 		xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4246 		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4248 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4249 					&xmm_dst_lo, &xmm_dst_hi);
4252 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     /* Tail loop. */
4265 	    *dst++ = pack_1x64_32 (
4266 		over_rev_non_pre_1x64 (
4267 		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
4276 /* -------------------------------------------------------------------------------------------------
4277 * composite_over_n_8888_0565_ca
/* OVER-composite a solid source through a component-alpha (a8r8g8b8)
 * mask onto an r5g6b5 destination.  Eight 565 pixels per aligned
 * iteration; _mm_movemask_epi8 on a compare-to-zero gives a cheap
 * "whole mask vector is zero" test (pack_cmp == 0xffff) that skips the
 * blend for fully transparent mask runs.
 */
4281 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4283                                     pixman_image_t *         src_image,
4284                                     pixman_image_t *         mask_image,
4285                                     pixman_image_t *         dst_image,
4296     uint16_t    *dst_line, *dst, d;
4297     uint32_t    *mask_line, *mask, m;
4298     int dst_stride, mask_stride;
4302     __m128i xmm_src, xmm_alpha;
4303     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4304     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4306     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4308     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4313     PIXMAN_IMAGE_GET_LINE (
4314 	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4315     PIXMAN_IMAGE_GET_LINE (
4316 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4318     xmm_src = expand_pixel_32_1x128 (src);
4319     xmm_alpha = expand_alpha_1x128 (xmm_src);
4320     mmx_src = _mm_movepi64_pi64 (xmm_src);
4321     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4328 	mask_line += mask_stride;
4329 	dst_line += dst_stride;
     /* Head loop: one 565 pixel at a time until aligned. */
4331 	while (w && ((unsigned long)dst & 15))
4333 	    m = *(uint32_t *) mask;
4338 		mmx_mask = unpack_32_1x64 (m);
4339 		mmx_dest = expand565_16_1x64 (d);
4341 		*dst = pack_565_32_16 (
4344 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
     /* Aligned loop: 8 dest pixels, two 4-wide mask loads. */
4355 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4356 	    xmm_dst = load_128_aligned ((__m128i*)dst);
         /* pack_cmp == 0xffff iff all four mask pixels are zero. */
4358 	    pack_cmp = _mm_movemask_epi8 (
4359 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4361 	    unpack_565_128_4x128 (xmm_dst,
4362 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4363 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4365 	    /* preload next round */
4366 	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4368 	    /* preload next round */
4369 	    if (pack_cmp != 0xffff)
4371 		in_over_2x128 (&xmm_src, &xmm_src,
4372 			       &xmm_alpha, &xmm_alpha,
4373 			       &xmm_mask_lo, &xmm_mask_hi,
4374 			       &xmm_dst0, &xmm_dst1);
         /* Second 4 mask pixels (the preloaded vector). */
4378 	    pack_cmp = _mm_movemask_epi8 (
4379 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4381 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4383 	    if (pack_cmp != 0xffff)
4385 		in_over_2x128 (&xmm_src, &xmm_src,
4386 			       &xmm_alpha, &xmm_alpha,
4387 			       &xmm_mask_lo, &xmm_mask_hi,
4388 			       &xmm_dst2, &xmm_dst3);
4392 		(__m128i*)dst, pack_565_4x128_128 (
4393 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
     /* Tail loop. */
4402 	    m = *(uint32_t *) mask;
4407 		mmx_mask = unpack_32_1x64 (m);
4408 		mmx_dest = expand565_16_1x64 (d);
4410 		*dst = pack_565_32_16 (
4413 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4425 /* -----------------------------------------------------------------------
4426 * composite_in_n_8_8
/* IN-composite: dst = src_alpha * mask * dst on a8 surfaces.
 * The solid source contributes only its alpha (xmm_alpha); each 16-byte
 * chunk does two pix_multiply passes: alpha*mask, then result*dst.
 */
4430 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4432                          pixman_image_t *         src_image,
4433                          pixman_image_t *         mask_image,
4434                          pixman_image_t *         dst_image,
4444     uint8_t     *dst_line, *dst;
4445     uint8_t     *mask_line, *mask;
4446     int dst_stride, mask_stride;
4453     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4454     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4456     PIXMAN_IMAGE_GET_LINE (
4457 	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4458     PIXMAN_IMAGE_GET_LINE (
4459 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4461     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
     /* Broadcast the solid source's alpha across all channels. */
4465     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4470 	dst_line += dst_stride;
4472 	mask_line += mask_stride;
     /* Head loop: single a8 pixels until 16-byte alignment. */
4475 	while (w && ((unsigned long)dst & 15))
4477 	    m = (uint32_t) *mask++;
4478 	    d = (uint32_t) *dst;
4480 	    *dst++ = (uint8_t) pack_1x64_32 (
4482 		    pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4483 				       unpack_32_1x64 (m)),
4484 		    unpack_32_1x64 (d)));
     /* Aligned loop: 16 a8 pixels per iteration. */
4490 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4491 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4493 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4494 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4496 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4497 				&xmm_mask_lo, &xmm_mask_hi,
4498 				&xmm_mask_lo, &xmm_mask_hi);
4500 	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4501 				&xmm_dst_lo, &xmm_dst_hi,
4502 				&xmm_dst_lo, &xmm_dst_hi);
4505 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     /* Tail loop. */
4514 	    m = (uint32_t) *mask++;
4515 	    d = (uint32_t) *dst;
4517 	    *dst++ = (uint8_t) pack_1x64_32 (
4520 			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4521 		    unpack_32_1x64 (d)));
4529 /* -----------------------------------------------------------------------
/* IN-composite with no mask: dst = src_alpha * dst on an a8 surface.
 * A special case (its guard is elided here) delegates to pixman_fill
 * to write the solid value directly; otherwise each pixel/vector is
 * multiplied by the broadcast source alpha.
 */
4534 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4536                        pixman_image_t *         src_image,
4537                        pixman_image_t *         mask_image,
4538                        pixman_image_t *         dst_image,
4548     uint8_t     *dst_line, *dst;
4555     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4557     PIXMAN_IMAGE_GET_LINE (
4558 	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4560     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4562     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
     /* Degenerate case (condition elided): plain fill with src. */
4571 	pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4572 		     8, dest_x, dest_y, width, height, src);
4580 	dst_line += dst_stride;
     /* Head loop: align dst. */
4583 	while (w && ((unsigned long)dst & 15))
4585 	    d = (uint32_t) *dst;
4587 	    *dst++ = (uint8_t) pack_1x64_32 (
4589 		    _mm_movepi64_pi64 (xmm_alpha),
4590 		    unpack_32_1x64 (d)));
     /* Aligned loop: 16 a8 pixels per iteration. */
4596 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4598 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4600 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4601 				&xmm_dst_lo, &xmm_dst_hi,
4602 				&xmm_dst_lo, &xmm_dst_hi);
4605 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     /* Tail loop. */
4613 	    d = (uint32_t) *dst;
4615 	    *dst++ = (uint8_t) pack_1x64_32 (
4617 		    _mm_movepi64_pi64 (xmm_alpha),
4618 		    unpack_32_1x64 (d)));
4626 /* ---------------------------------------------------------------------------
/* IN-composite two a8 surfaces: dst = src * dst, per pixel.
 * 16 pixels per aligned iteration via pix_multiply_2x128; scalar MMX
 * head/tail.
 */
4631 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4633                        pixman_image_t *         src_image,
4634                        pixman_image_t *         mask_image,
4635                        pixman_image_t *         dst_image,
4645     uint8_t     *dst_line, *dst;
4646     uint8_t     *src_line, *src;
4647     int src_stride, dst_stride;
4651     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4652     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4654     PIXMAN_IMAGE_GET_LINE (
4655 	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4656     PIXMAN_IMAGE_GET_LINE (
4657 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4662 	dst_line += dst_stride;
4664 	src_line += src_stride;
     /* Head loop: align dst to 16 bytes. */
4667 	while (w && ((unsigned long)dst & 15))
4669 	    s = (uint32_t) *src++;
4670 	    d = (uint32_t) *dst;
4672 	    *dst++ = (uint8_t) pack_1x64_32 (
4674 		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
     /* Aligned loop: 16 a8 pixels per iteration. */
4680 	    xmm_src = load_128_unaligned ((__m128i*)src);
4681 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4683 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4684 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4686 	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4687 				&xmm_dst_lo, &xmm_dst_hi,
4688 				&xmm_dst_lo, &xmm_dst_hi);
4691 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     /* Tail loop. */
4700 	    s = (uint32_t) *src++;
4701 	    d = (uint32_t) *dst;
4703 	    *dst++ = (uint8_t) pack_1x64_32 (
4704 		pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
4712 /* -------------------------------------------------------------------------
4713 * composite_add_n_8_8
/* ADD-composite a solid source through an a8 mask onto an a8 surface:
 * dst = saturate(dst + src_alpha * mask).  Saturating byte adds are done
 * in 16-bit lanes with _mm_adds_epu16 after unpacking.
 */
4717 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4719                           pixman_image_t *         src_image,
4720                           pixman_image_t *         mask_image,
4721                           pixman_image_t *         dst_image,
4731     uint8_t     *dst_line, *dst;
4732     uint8_t     *mask_line, *mask;
4733     int dst_stride, mask_stride;
4740     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4741     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4743     PIXMAN_IMAGE_GET_LINE (
4744 	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4745     PIXMAN_IMAGE_GET_LINE (
4746 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4748     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4752     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4757 	dst_line += dst_stride;
4759 	mask_line += mask_stride;
     /* Head loop: align dst. */
4762 	while (w && ((unsigned long)dst & 15))
4764 	    m = (uint32_t) *mask++;
4765 	    d = (uint32_t) *dst;
4767 	    *dst++ = (uint8_t) pack_1x64_32 (
4770 			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4771 		    unpack_32_1x64 (d)));
     /* Aligned loop: 16 a8 pixels per iteration. */
4777 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4778 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4780 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4781 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4783 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4784 				&xmm_mask_lo, &xmm_mask_hi,
4785 				&xmm_mask_lo, &xmm_mask_hi);
         /* Saturating add of the scaled mask onto the destination. */
4787 	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4788 	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4791 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     /* Tail loop. */
4800 	    m = (uint32_t) *mask++;
4801 	    d = (uint32_t) *dst;
4803 	    *dst++ = (uint8_t) pack_1x64_32 (
4806 			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4807 		    unpack_32_1x64 (d)));
4816 /* -------------------------------------------------------------------------
4817 * composite_add_n_8_8
/* ADD-composite a solid source onto an a8 surface with no mask:
 * dst = saturate(dst + src_alpha).  A special case (guard elided)
 * delegates to pixman_fill with 0xff; otherwise the byte-replicated
 * source is added with _mm_adds_epu8 (no unpack needed).
 */
4821 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4823                         pixman_image_t *         src_image,
4824                         pixman_image_t *         mask_image,
4825                         pixman_image_t *         dst_image,
4835     uint8_t     *dst_line, *dst;
4842     PIXMAN_IMAGE_GET_LINE (
4843 	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4845     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
     /* Degenerate case (condition elided): saturates to full coverage. */
4854 	pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4855 	             8, dest_x, dest_y, width, height, 0xff);
     /* Replicate the 8-bit value into all 16 byte lanes. */
4860     src = (src << 24) | (src << 16) | (src << 8) | src;
4861     xmm_src = _mm_set_epi32 (src, src, src, src);
4866 	dst_line += dst_stride;
     /* Head loop: align dst, one byte at a time via MMX saturating add. */
4869 	while (w && ((unsigned long)dst & 15))
4871 	    *dst = (uint8_t)_mm_cvtsi64_si32 (
4873 		    _mm_movepi64_pi64 (xmm_src),
4874 		    _mm_cvtsi32_si64 (*dst)));
     /* Aligned loop: 16 bytes per saturating add. */
4883 		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
     /* Tail loop. */
4891 	    *dst = (uint8_t)_mm_cvtsi64_si32 (
4893 		    _mm_movepi64_pi64 (xmm_src),
4894 		    _mm_cvtsi32_si64 (*dst)));
4904 /* ----------------------------------------------------------------------
4905 * composite_add_8000_8000
/* ADD-composite two a8 surfaces: dst = saturate(dst + src).
 * Scalar saturation uses the branch-free trick t | (0 - (t >> 8)):
 * when t > 255 the high byte of t forces the result to 0xff.
 * The 4-byte-aligned middle is handed to core_combine_add_u_sse2,
 * treating groups of four a8 pixels as one uint32.
 */
4909 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
4911                               pixman_image_t *         src_image,
4912                               pixman_image_t *         mask_image,
4913                               pixman_image_t *         dst_image,
4923     uint8_t     *dst_line, *dst;
4924     uint8_t     *src_line, *src;
4925     int dst_stride, src_stride;
4929     PIXMAN_IMAGE_GET_LINE (
4930 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4931     PIXMAN_IMAGE_GET_LINE (
4932 	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4939 	dst_line += dst_stride;
4940 	src_line += src_stride;
     /* Head loop: bytes until dst is 4-byte aligned. */
4944 	while (w && (unsigned long)dst & 3)
4946 	    t = (*dst) + (*src++);
4947 	    *dst++ = t | (0 - (t >> 8));
     /* Middle: w/4 dwords of packed a8 pixels. */
4951 	core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
     /* Tail loop: leftover bytes. */
4961 	    t = (*dst) + (*src++);
4962 	    *dst++ = t | (0 - (t >> 8));
4970 /* ---------------------------------------------------------------------
4971 * composite_add_8888_8888
/* ADD-composite two a8r8g8b8 surfaces, one scanline at a time; the
 * per-pixel work is entirely inside core_combine_add_u_sse2.
 */
4974 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4976                               pixman_image_t *         src_image,
4977                               pixman_image_t *         mask_image,
4978                               pixman_image_t *         dst_image,
4988     uint32_t    *dst_line, *dst;
4989     uint32_t    *src_line, *src;
4990     int dst_stride, src_stride;
4992     PIXMAN_IMAGE_GET_LINE (
4993 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4994     PIXMAN_IMAGE_GET_LINE (
4995 	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5000 	dst_line += dst_stride;
5002 	src_line += src_stride;
     /* No mask (NULL): plain saturating add of width pixels. */
5004 	core_combine_add_u_sse2 (dst, src, NULL, width);
5010 /* -------------------------------------------------------------------------------------------------
5011 * sse2_composite_copy_area
5014 static pixman_bool_t
5015 pixman_blt_sse2 (uint32_t *src_bits,
5028 uint8_t * src_bytes;
5029 uint8_t * dst_bytes;
5032 if (src_bpp != dst_bpp)
5037 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5038 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5039 src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5040 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5041 byte_width = 2 * width;
5045 else if (src_bpp == 32)
5047 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5048 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5049 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5050 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5051 byte_width = 4 * width;
5063 uint8_t *s = src_bytes;
5064 uint8_t *d = dst_bytes;
5065 src_bytes += src_stride;
5066 dst_bytes += dst_stride;
5069 while (w >= 2 && ((unsigned long)d & 3))
5071 *(uint16_t *)d = *(uint16_t *)s;
5077 while (w >= 4 && ((unsigned long)d & 15))
5079 *(uint32_t *)d = *(uint32_t *)s;
5088 __m128i xmm0, xmm1, xmm2, xmm3;
5090 xmm0 = load_128_unaligned ((__m128i*)(s));
5091 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5092 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5093 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5095 save_128_aligned ((__m128i*)(d), xmm0);
5096 save_128_aligned ((__m128i*)(d + 16), xmm1);
5097 save_128_aligned ((__m128i*)(d + 32), xmm2);
5098 save_128_aligned ((__m128i*)(d + 48), xmm3);
5107 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5116 *(uint32_t *)d = *(uint32_t *)s;
5125 *(uint16_t *)d = *(uint16_t *)s;
/* SRC-composite with identical formats: a plain rectangle copy,
 * delegated to pixman_blt_sse2. */
5138 sse2_composite_copy_area (pixman_implementation_t *imp,
5140                           pixman_image_t *         src_image,
5141                           pixman_image_t *         mask_image,
5142                           pixman_image_t *         dst_image,
5152     pixman_blt_sse2 (src_image->bits.bits,
5153                      dst_image->bits.bits,
5154                      src_image->bits.rowstride,
5155                      dst_image->bits.rowstride,
5156                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5157                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5158                      src_x, src_y, dest_x, dest_y, width, height);
/* OVER-composite an x8r8g8b8 source through an a8 mask onto a8r8g8b8.
 * The source's undefined alpha byte is forced to 0xff (0xff000000 OR /
 * mask_ff000000), making the source opaque, so only the 8-bit mask
 * drives the blend (source "alpha" vector is the constant mask_00ff).
 */
5162 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5164                                  pixman_image_t *         src_image,
5165                                  pixman_image_t *         mask_image,
5166                                  pixman_image_t *         dst_image,
5176     uint32_t    *src, *src_line, s;
5177     uint32_t    *dst, *dst_line, d;
5178     uint8_t	    *mask, *mask_line;
5180     int src_stride, mask_stride, dst_stride;
5184     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5185     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5186     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5188     PIXMAN_IMAGE_GET_LINE (
5189 	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5190     PIXMAN_IMAGE_GET_LINE (
5191 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5192     PIXMAN_IMAGE_GET_LINE (
5193 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5198 	src_line += src_stride;
5200 	dst_line += dst_stride;
5202 	mask_line += mask_stride;
     /* Head loop: align dst to 16 bytes. */
5206 	while (w && (unsigned long)dst & 15)
         /* Force the x-channel to opaque alpha. */
5208 	    s = 0xff000000 | *src++;
5209 	    m = (uint32_t) *mask++;
5211 	    ms = unpack_32_1x64 (s);
5215 		__m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5216 		__m64 md = unpack_32_1x64 (d);
5218 		ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5221 	    *dst++ = pack_1x64_32 (ms);
     /* Aligned loop: 4 pixels, 4 mask bytes per iteration. */
5227 	    m = *(uint32_t*) mask;
5228 	    xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
         /* Fully-on mask: source is opaque, store it directly. */
5230 	    if (m == 0xffffffff)
5232 		save_128_aligned ((__m128i*)dst, xmm_src);
5236 		xmm_dst = load_128_aligned ((__m128i*)dst);
5238 		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5240 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5241 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5242 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5244 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
         /* Source alpha is constant 0xff per channel (mask_00ff). */
5246 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5248 		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     /* Tail loop. */
5259 	    m = (uint32_t) *mask++;
5263 		s = 0xff000000 | *src;
5275 		    ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5276 		    md = unpack_32_1x64 (d);
5277 		    ms = unpack_32_1x64 (s);
5279 		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
/* OVER-composite an a8r8g8b8 source through an a8 mask onto a8r8g8b8.
 * Unlike the x888 variant, the source carries real alpha (sa), so the
 * per-pixel fast path requires both sa == 0xff and m == 0xff before a
 * plain store; the vector path checks m == 0xffffffff && is_opaque(src).
 */
5294 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5296                                  pixman_image_t *         src_image,
5297                                  pixman_image_t *         mask_image,
5298                                  pixman_image_t *         dst_image,
5308     uint32_t    *src, *src_line, s;
5309     uint32_t    *dst, *dst_line, d;
5310     uint8_t	    *mask, *mask_line;
5312     int src_stride, mask_stride, dst_stride;
5315     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5316     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5317     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5319     PIXMAN_IMAGE_GET_LINE (
5320 	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5321     PIXMAN_IMAGE_GET_LINE (
5322 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5323     PIXMAN_IMAGE_GET_LINE (
5324 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5329 	src_line += src_stride;
5331 	dst_line += dst_stride;
5333 	mask_line += mask_stride;
     /* Head loop: one pixel at a time until aligned. */
5337 	while (w && (unsigned long)dst & 15)
5342 	    m = (uint32_t) *mask++;
         /* Opaque source and full mask: copy straight through. */
5349 		if (sa == 0xff && m == 0xff)
5355 		    __m64 ms, md, ma, msa;
5357 		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5358 		    ms = unpack_32_1x64 (s);
5359 		    md = unpack_32_1x64 (d);
5361 		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5363 		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
     /* Aligned loop: 4 pixels, 4 mask bytes per iteration. */
5373 	    m = *(uint32_t *) mask;
5377 		xmm_src = load_128_unaligned ((__m128i*)src);
5379 		if (m == 0xffffffff && is_opaque (xmm_src))
5381 		    save_128_aligned ((__m128i *)dst, xmm_src);
5385 		    xmm_dst = load_128_aligned ((__m128i *)dst);
5387 		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5389 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5390 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5391 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
             /* Real per-pixel source alpha, unlike the x888 path. */
5393 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5394 		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5396 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5397 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5399 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     /* Tail loop. */
5414 	    m = (uint32_t) *mask++;
5421 		if (sa == 0xff && m == 0xff)
5427 		    __m64 ms, md, ma, msa;
5429 		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5430 		    ms = unpack_32_1x64 (s);
5431 		    md = unpack_32_1x64 (d);
5433 		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5435 		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
/* OVER_REVERSE with a solid source: dst = dst OVER src, i.e. the
 * existing destination is composited over the solid color.  Note the
 * role reversal in over_1x64/over_2x128: dest and its alpha are the
 * "top" operands and the solid source is the "bottom".
 */
5448 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5450                                     pixman_image_t *         src_image,
5451                                     pixman_image_t *         mask_image,
5452                                     pixman_image_t *         dst_image,
5463     uint32_t    *dst_line, *dst;
5465     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5466     __m128i xmm_dsta_hi, xmm_dsta_lo;
5470     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5475     PIXMAN_IMAGE_GET_LINE (
5476 	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5478     xmm_src = expand_pixel_32_1x128 (src);
5484 	dst_line += dst_stride;
     /* Head loop: align dst. */
5487 	while (w && (unsigned long)dst & 15)
5491 	    vd = unpack_32_1x64 (*dst);
         /* dst over solid-src: dest is the top layer here. */
5493 	    *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5494 					    _mm_movepi64_pi64 (xmm_src)));
     /* Aligned loop: 4 pixels per iteration. */
5501 	    __m128i tmp_lo, tmp_hi;
5503 	    xmm_dst = load_128_aligned ((__m128i*)dst);
5505 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5506 	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5511 	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5512 			&xmm_dsta_lo, &xmm_dsta_hi,
5516 		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
     /* Tail loop. */
5526 	    vd = unpack_32_1x64 (*dst);
5528 	    *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5529 					    _mm_movepi64_pi64 (xmm_src)));
/* OVER-composite a8r8g8b8 source through an a8r8g8b8 mask (only the
 * mask's alpha byte is used — note the >> 24) onto a8r8g8b8.  The
 * vector path skips work for fully transparent mask vectors and stores
 * directly when both mask and source are fully opaque.
 */
5540 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5542                                     pixman_image_t *         src_image,
5543                                     pixman_image_t *         mask_image,
5544                                     pixman_image_t *         dst_image,
5554     uint32_t    *src, *src_line, s;
5555     uint32_t    *dst, *dst_line, d;
5556     uint32_t    *mask, *mask_line;
5558     int src_stride, mask_stride, dst_stride;
5561     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5562     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5563     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5565     PIXMAN_IMAGE_GET_LINE (
5566 	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5567     PIXMAN_IMAGE_GET_LINE (
5568 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5569     PIXMAN_IMAGE_GET_LINE (
5570 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5575 	src_line += src_stride;
5577 	dst_line += dst_stride;
5579 	mask_line += mask_stride;
     /* Head loop: one pixel; mask alpha extracted by shifting. */
5583 	while (w && (unsigned long)dst & 15)
5588 	    m = (*mask++) >> 24;
5595 		if (sa == 0xff && m == 0xff)
5601 		    __m64 ms, md, ma, msa;
5603 		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5604 		    ms = unpack_32_1x64 (s);
5605 		    md = unpack_32_1x64 (d);
5607 		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5609 		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
     /* Aligned loop: 4 pixels; mask is a full 32-bit-per-pixel load. */
5619 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
         /* Skip entirely transparent mask vectors. */
5621 	    if (!is_transparent (xmm_mask))
5623 		xmm_src = load_128_unaligned ((__m128i*)src);
5625 		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5627 		    save_128_aligned ((__m128i *)dst, xmm_src);
5631 		    xmm_dst = load_128_aligned ((__m128i *)dst);
5633 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5634 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5635 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5637 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
             /* expand_alpha (not _rev): use the mask's alpha channel. */
5638 		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5640 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5641 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5643 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     /* Tail loop. */
5658 	    m = (*mask++) >> 24;
5665 		if (sa == 0xff && m == 0xff)
5671 		    __m64 ms, md, ma, msa;
5673 		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5674 		    ms = unpack_32_1x64 (s);
5675 		    md = unpack_32_1x64 (d);
5677 		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5679 		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5691 /* A variant of 'core_combine_over_u_sse2' with minor tweaks */
/*
 * Nearest-neighbour scaled OVER for one scanline of 32-bit pixels: the
 * source x coordinate is a 16.16 fixed-point value, advanced by unit_x per
 * destination pixel (the "vx += unit_x" steps and part of the parameter
 * list fall outside this extract; "vx >> 16" below extracts the integer
 * source index).
 * NOTE(review): pm is fixed to NULL here, so combine1/combine4 presumably
 * take their unmasked path -- confirm against their definitions.
 */
5692 static force_inline void
5693 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5697 pixman_fixed_t unit_x,
5698 pixman_fixed_t max_vx)
5701 const uint32_t* pm = NULL;
/* Unpacked lo/hi halves for the 4-pixel SIMD blend below. */
5703 __m128i xmm_dst_lo, xmm_dst_hi;
5704 __m128i xmm_src_lo, xmm_src_hi;
5705 __m128i xmm_alpha_lo, xmm_alpha_hi;
5707 /* Align dst on a 16-byte boundary */
5708 while (w && ((unsigned long)pd & 15))
/* Fetch the nearest source pixel for the current fixed-point position. */
5711 s = combine1 (ps + (vx >> 16), pm);
5714 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
/* Main loop (header outside this extract): gather 4 nearest-neighbour
 * source pixels.  The vx increments between these loads are among the
 * lines missing from this extract. */
5723 uint32_t tmp1, tmp2, tmp3, tmp4;
5725 tmp1 = ps[vx >> 16];
5727 tmp2 = ps[vx >> 16];
5729 tmp3 = ps[vx >> 16];
5731 tmp4 = ps[vx >> 16];
/* Pack the 4 gathered pixels into one vector; tmp1 ends up in the lowest
 * 32-bit lane (see _mm_set_epi32 argument order). */
5734 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5736 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
/* All four source pixels opaque: OVER is a straight copy. */
5738 if (is_opaque (xmm_src_hi))
5740 save_128_aligned ((__m128i*)pd, xmm_src_hi);
/* All four fully transparent: dst already correct; otherwise blend. */
5742 else if (!is_zero (xmm_src_hi))
5744 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5746 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5747 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5749 expand_alpha_2x128 (
5750 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
/* src OVER dst on both halves; result lands in xmm_dst_lo/hi. */
5752 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5753 &xmm_alpha_lo, &xmm_alpha_hi,
5754 &xmm_dst_lo, &xmm_dst_hi);
5756 /* rebuild the 4 pixel data and save */
5757 save_128_aligned ((__m128i*)pd,
5758 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail: remaining pixels one at a time, as in the head loop. */
5770 s = combine1 (ps + (vx >> 16), pm);
5773 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
/* Instantiate the nearest-neighbour scaling main loops around the
 * scanline function above, one per repeat mode (COVER / NONE / PAD).
 * The FAST_NEAREST_MAINLOOP macro presumably comes from
 * pixman-fast-path.h, which is included at the top of this file. */
5782 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5783 scaled_nearest_scanline_sse2_8888_8888_OVER,
5784 uint32_t, uint32_t, COVER);
5785 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5786 scaled_nearest_scanline_sse2_8888_8888_OVER,
5787 uint32_t, uint32_t, NONE);
5788 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5789 scaled_nearest_scanline_sse2_8888_8888_OVER,
5790 uint32_t, uint32_t, PAD);
/*
 * Fast-path dispatch table: each entry maps an (operator, source format,
 * mask format, destination format) combination to the specialized SSE2
 * routine above.  The table is handed to _pixman_implementation_create ()
 * below.  (The terminating entry and closing brace fall outside this
 * extract.)
 */
5792 static const pixman_fast_path_t sse2_fast_paths[] =
5794 /* PIXMAN_OP_OVER */
5795 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5796 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5797 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5798 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5799 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5800 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5801 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5802 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5803 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5804 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5805 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5806 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5807 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5808 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5809 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5810 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5811 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5812 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5813 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5814 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5815 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5816 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5817 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5818 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5819 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5820 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5821 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5822 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5823 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5824 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5825 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5826 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
/* Component-alpha (CA) variants: the mask carries per-channel alpha. */
5827 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5828 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5829 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5830 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5831 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5832 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5833 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5834 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5835 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5836 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5837 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5838 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
/* x8 destinations ignore alpha, so OVER from x8 sources is a plain copy. */
5839 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5840 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5842 /* PIXMAN_OP_OVER_REVERSE */
5843 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5844 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
/* PIXMAN_OP_ADD entries (the original section-header comment is among the
 * lines missing from this extract). */
5847 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5848 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
5849 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5850 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5851 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5852 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
/* PIXMAN_OP_SRC entries. */
5855 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5856 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5857 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5858 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5859 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5860 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5861 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5862 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5863 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5864 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5865 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5866 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5867 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5868 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
/* PIXMAN_OP_IN entries. */
5871 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5872 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5873 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
/* Nearest-neighbour scaled OVER entries generated by the
 * FAST_NEAREST_MAINLOOP instantiations above, one per repeat mode. */
5875 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5876 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5877 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5878 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5879 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5880 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5881 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5882 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5883 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5884 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5885 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5886 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
/*
 * blt entry point for the SSE2 implementation: try the SSE2 blitter first
 * and fall back to the delegate implementation when pixman_blt_sse2
 * reports it cannot handle the request.  (Part of the parameter list, the
 * braces and the delegate argument fall outside this extract.)
 */
5892 static pixman_bool_t
5893 sse2_blt (pixman_implementation_t *imp,
5894 uint32_t * src_bits,
5895 uint32_t * dst_bits,
/* pixman_blt_sse2 returns false when it declines the format/bpp combo. */
5906 if (!pixman_blt_sse2 (
5907 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5908 src_x, src_y, dst_x, dst_y, width, height))
/* Hand the identical request down the implementation chain. */
5911 return _pixman_implementation_blt (
5913 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5914 src_x, src_y, dst_x, dst_y, width, height);
5920 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
/* On 32-bit GCC the incoming stack is not guaranteed to be 16-byte
 * aligned; force_align_arg_pointer realigns it on entry so SSE spills of
 * __m128i locals are safe (see the GCC function-attribute docs). */
5921 __attribute__((__force_align_arg_pointer__))
/*
 * fill entry point for the SSE2 implementation: try pixman_fill_sse2 and
 * delegate down the chain when it declines the request.  (Part of the
 * parameter list and the braces fall outside this extract.)
 */
5923 static pixman_bool_t
5924 sse2_fill (pixman_implementation_t *imp,
5934 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5936 return _pixman_implementation_fill (
5937 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5943 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
/* Realign the stack on 32-bit GCC; same rationale as for sse2_fill. */
5944 __attribute__((__force_align_arg_pointer__))
/*
 * Build the SSE2 implementation: choose a fallback delegate, initialize
 * the file-scope SIMD constant masks, and register the SSE2 combiners,
 * blt and fill hooks.  (The "return imp;" and closing brace fall outside
 * this extract.)
 */
5946 pixman_implementation_t *
5947 _pixman_implementation_create_sse2 (void)
/* NOTE(review): both 'fallback' initializers appear below because the
 * preprocessor lines selecting between them (presumably
 * #ifdef USE_MMX / #else / #endif) are missing from this extract --
 * only one is compiled. */
5950 pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
5952 pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
5954 pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
5956 /* SSE2 constants */
5957 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5958 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5959 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5960 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5961 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5962 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5963 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5964 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5965 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
5966 mask_0080 = create_mask_16_128 (0x0080);
5967 mask_00ff = create_mask_16_128 (0x00ff);
5968 mask_0101 = create_mask_16_128 (0x0101);
5969 mask_ffff = create_mask_16_128 (0xffff);
5970 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5971 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
/* 64-bit (__m64) constants for the MMX-style 1-pixel helpers; the static
 * mask_x* definitions appear near the top of this file. */
5974 mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5975 mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5977 mask_x0080 = create_mask_16_64 (0x0080);
5978 mask_x00ff = create_mask_16_64 (0x00ff);
5979 mask_x0101 = create_mask_16_64 (0x0101);
5980 mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5984 /* Set up function pointers */
5986 /* SSE code patch for fbcompose.c */
/* Unified-alpha (32-bit) combiners. */
5987 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5988 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5989 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5990 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5991 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5992 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5993 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5994 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5995 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5996 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5998 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
/* Component-alpha (per-channel mask) combiners. */
6000 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6001 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6002 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6003 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6004 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6005 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6006 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6007 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6008 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6009 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6010 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
/* Whole-image operations handled before per-pixel compositing. */
6012 imp->blt = sse2_blt;
6013 imp->fill = sse2_fill;
6018 #endif /* USE_SSE2 */