2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38 #include "pixman-fast-path.h"
40 #if defined(_MSC_VER) && defined(_M_AMD64)
41 /* 64-bit MSVC doesn't support the MMX intrinsics, so
42 * the pixman-x64-mmx-emulation.h file contains
43 * implementations of those MMX intrinsics that
44 * are used in the SSE2 implementation.
46 # include "pixman-x64-mmx-emulation.h"
51 /* --------------------------------------------------------------------
55 static __m64 mask_x0080;
56 static __m64 mask_x00ff;
57 static __m64 mask_x0101;
58 static __m64 mask_x_alpha;
60 static __m64 mask_x565_rgb;
61 static __m64 mask_x565_unpack;
63 static __m128i mask_0080;
64 static __m128i mask_00ff;
65 static __m128i mask_0101;
66 static __m128i mask_ffff;
67 static __m128i mask_ff000000;
68 static __m128i mask_alpha;
70 static __m128i mask_565_r;
71 static __m128i mask_565_g1, mask_565_g2;
72 static __m128i mask_565_b;
73 static __m128i mask_red;
74 static __m128i mask_green;
75 static __m128i mask_blue;
77 static __m128i mask_565_fix_rb;
78 static __m128i mask_565_fix_g;
80 /* ----------------------------------------------------------------------
83 static force_inline __m128i
84 unpack_32_1x128 (uint32_t data)
86 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
89 static force_inline void
90 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
92 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
93 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
96 static force_inline __m128i
97 unpack_565_to_8888 (__m128i lo)
99 __m128i r, g, b, rb, t;
101 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
102 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
103 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
105 rb = _mm_or_si128 (r, b);
106 t = _mm_and_si128 (rb, mask_565_fix_rb);
107 t = _mm_srli_epi32 (t, 5);
108 rb = _mm_or_si128 (rb, t);
110 t = _mm_and_si128 (g, mask_565_fix_g);
111 t = _mm_srli_epi32 (t, 6);
112 g = _mm_or_si128 (g, t);
114 return _mm_or_si128 (rb, g);
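/* Reference-only scalar sketch of what unpack_565_to_8888 () above does to
 * a single pixel (kept under #if 0, never compiled): each 5/6-bit field is
 * widened to 8 bits by replicating its top bits into the low bits, so that
 * 0x1f -> 0xff and 0x3f -> 0xff.
 */
#if 0
static uint32_t
unpack_565_to_8888_ref (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5)  & 0x3f;
    uint32_t b =  p        & 0x1f;

    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}
#endif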
117 static force_inline void
118 unpack_565_128_4x128 (__m128i data,
126 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
127 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
129 lo = unpack_565_to_8888 (lo);
130 hi = unpack_565_to_8888 (hi);
132 unpack_128_2x128 (lo, data0, data1);
133 unpack_128_2x128 (hi, data2, data3);
136 static force_inline uint16_t
137 pack_565_32_16 (uint32_t pixel)
139 return (uint16_t) (((pixel >> 8) & 0xf800) |
140 ((pixel >> 5) & 0x07e0) |
141 ((pixel >> 3) & 0x001f));
144 static force_inline __m128i
145 pack_2x128_128 (__m128i lo, __m128i hi)
147 return _mm_packus_epi16 (lo, hi);
150 static force_inline __m128i
151 pack_565_2x128_128 (__m128i lo, __m128i hi)
154 __m128i r, g1, g2, b;
156 data = pack_2x128_128 (lo, hi);
158 r = _mm_and_si128 (data, mask_565_r);
159 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
160 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
161 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
163 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
166 static force_inline __m128i
167 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
169 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
170 pack_565_2x128_128 (*xmm2, *xmm3));
173 static force_inline int
174 is_opaque (__m128i x)
176 __m128i ffs = _mm_cmpeq_epi8 (x, x);
178 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
181 static force_inline int
182 is_zero (__m128i x)
184 return _mm_movemask_epi8 (
185 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
188 static force_inline int
189 is_transparent (__m128i x)
191 return (_mm_movemask_epi8 (
192 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
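/* Note on the 0x8888 masks above: _mm_movemask_epi8 () returns one bit per
 * byte, and 0x8888 selects byte 3 of each of the four 32-bit pixels, i.e.
 * the alpha bytes of a8r8g8b8 data.  is_opaque () and is_transparent ()
 * therefore only test the four alpha channels, while is_zero () tests
 * every byte.
 */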
195 static force_inline __m128i
196 expand_pixel_32_1x128 (uint32_t data)
198 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
201 static force_inline __m128i
202 expand_alpha_1x128 (__m128i data)
204 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
205 _MM_SHUFFLE (3, 3, 3, 3)),
206 _MM_SHUFFLE (3, 3, 3, 3));
209 static force_inline void
210 expand_alpha_2x128 (__m128i data_lo,
217 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
218 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
220 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
221 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
224 static force_inline void
225 expand_alpha_rev_2x128 (__m128i data_lo,
232 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
233 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
234 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
235 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
238 static force_inline void
239 pix_multiply_2x128 (__m128i* data_lo,
248 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
249 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
250 lo = _mm_adds_epu16 (lo, mask_0080);
251 hi = _mm_adds_epu16 (hi, mask_0080);
252 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
253 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
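/* pix_multiply_2x128 () uses the usual exact division-by-255 trick:
 * (x * a + 0x80) * 0x101 >> 16 equals (x * a) / 255 rounded to nearest;
 * mask_0080 supplies the 0x80 bias and mask_0101 the * 0x101 step.
 * A reference-only scalar sketch for one 8-bit channel (never compiled):
 */
#if 0
static uint8_t
mul_un8_ref (uint8_t x, uint8_t a)
{
    uint32_t t = (uint32_t) x * a + 0x80;

    return (uint8_t) ((t * 0x101) >> 16);
}
#endif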
256 static force_inline void
257 pix_add_multiply_2x128 (__m128i* src_lo,
259 __m128i* alpha_dst_lo,
260 __m128i* alpha_dst_hi,
263 __m128i* alpha_src_lo,
264 __m128i* alpha_src_hi,
268 __m128i t1_lo, t1_hi;
269 __m128i t2_lo, t2_hi;
271 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
272 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
274 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
275 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
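/* pix_add_multiply_2x128 () computes, per channel,
 *
 *     ret = src * alpha_dst + dst * alpha_src
 *
 * with the final addition saturating at 0xff.  It is the shared building
 * block of the ATOP, ATOP_REVERSE and XOR combiners below.
 */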
278 static force_inline void
279 negate_2x128 (__m128i data_lo,
284 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
285 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
288 static force_inline void
289 invert_colors_2x128 (__m128i data_lo,
296 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
297 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
298 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
299 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
302 static force_inline void
303 over_2x128 (__m128i* src_lo,
312 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
314 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
316 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
317 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
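/* over_2x128 () is the premultiplied OVER operator on four unpacked
 * pixels (two per 128-bit half):
 *
 *     dst = src + dst * (1 - alpha)
 *
 * where 'alpha' is normally the expanded source alpha.
 */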
320 static force_inline void
321 over_rev_non_pre_2x128 (__m128i src_lo,
327 __m128i alpha_lo, alpha_hi;
329 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
331 lo = _mm_or_si128 (alpha_lo, mask_alpha);
332 hi = _mm_or_si128 (alpha_hi, mask_alpha);
334 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
336 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
338 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
341 static force_inline void
342 in_over_2x128 (__m128i* src_lo,
354 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
355 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
357 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
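/* in_over_2x128 () is (src IN mask) OVER dst, i.e.
 *
 *     dst = src * mask + dst * (1 - alpha (src) * mask)
 *
 * which is the core of the masked OVER fast paths further down.
 */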
360 /* load 4 pixels from a 16-byte-aligned address */
361 static force_inline __m128i
362 load_128_aligned (__m128i* src)
364 return _mm_load_si128 (src);
367 /* load 4 pixels from an unaligned address */
368 static force_inline __m128i
369 load_128_unaligned (const __m128i* src)
371 return _mm_loadu_si128 (src);
374 /* save 4 pixels with a non-temporal (write-combining) store to a
375 * 16-byte-aligned address
377 static force_inline void
378 save_128_write_combining (__m128i* dst,
381 _mm_stream_si128 (dst, data);
384 /* save 4 pixels to a 16-byte-aligned address */
385 static force_inline void
386 save_128_aligned (__m128i* dst,
389 _mm_store_si128 (dst, data);
392 /* save 4 pixels to an unaligned address */
393 static force_inline void
394 save_128_unaligned (__m128i* dst,
397 _mm_storeu_si128 (dst, data);
400 /* ------------------------------------------------------------------
404 static force_inline __m64
405 load_32_1x64 (uint32_t data)
407 return _mm_cvtsi32_si64 (data);
410 static force_inline __m64
411 unpack_32_1x64 (uint32_t data)
413 return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
416 static force_inline __m64
417 expand_alpha_1x64 (__m64 data)
419 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
422 static force_inline __m64
423 expand_alpha_rev_1x64 (__m64 data)
425 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
428 static force_inline __m64
429 expand_pixel_8_1x64 (uint8_t data)
431 return _mm_shuffle_pi16 (
432 unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
435 static force_inline __m64
436 pix_multiply_1x64 (__m64 data,
439 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
444 static force_inline __m64
445 pix_add_multiply_1x64 (__m64* src,
450 __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
451 __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
453 return _mm_adds_pu8 (t1, t2);
456 static force_inline __m64
457 negate_1x64 (__m64 data)
459 return _mm_xor_si64 (data, mask_x00ff);
462 static force_inline __m64
463 invert_colors_1x64 (__m64 data)
465 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
468 static force_inline __m64
469 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
471 return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
474 static force_inline __m64
475 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
477 return over_1x64 (pix_multiply_1x64 (*src, *mask),
478 pix_multiply_1x64 (*alpha, *mask),
482 static force_inline __m64
483 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
485 __m64 alpha = expand_alpha_1x64 (src);
487 return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
488 _mm_or_si64 (alpha, mask_x_alpha)),
493 static force_inline uint32_t
494 pack_1x64_32 (__m64 data)
496 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
499 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into 00RR00GG00BB
503 * --- Expanding 565 in the low word ---
505 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
506 * m = m & (01f0003f001f);
507 * m = m * (008404100840);
510 * Note the trick here - the top word is shifted by another nibble to
511 * avoid it bumping into the middle word
513 static force_inline __m64
514 expand565_16_1x64 (uint16_t pixel)
519 p = _mm_cvtsi32_si64 ((uint32_t) pixel);
521 t1 = _mm_slli_si64 (p, 36 - 11);
522 t2 = _mm_slli_si64 (p, 16 - 5);
524 p = _mm_or_si64 (t1, p);
525 p = _mm_or_si64 (t2, p);
526 p = _mm_and_si64 (p, mask_x565_rgb);
527 p = _mm_mullo_pi16 (p, mask_x565_unpack);
529 return _mm_srli_pi16 (p, 8);
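/* Worked example for the expansion above, using the constants quoted in
 * its header comment: a saturated blue field b = 0x1f gives
 * 0x1f * 0x0840 = 0xffc0 and 0xffc0 >> 8 = 0xff, so the 5-bit maximum
 * expands to the 8-bit maximum; the green and red lanes work out the same
 * way (0x3f * 0x0410 and 0x1f0 * 0x0084 both yield 0xff after the shift).
 */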
532 /* ----------------------------------------------------------------------------
533 * Compose Core transformations
535 static force_inline uint32_t
536 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
549 ms = unpack_32_1x64 (src);
550 return pack_1x64_32 (
551 over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
557 static force_inline uint32_t
558 combine1 (const uint32_t *ps, const uint32_t *pm)
566 mm = unpack_32_1x64 (*pm);
567 mm = expand_alpha_1x64 (mm);
569 ms = unpack_32_1x64 (s);
570 ms = pix_multiply_1x64 (ms, mm);
572 s = pack_1x64_32 (ms);
578 static force_inline __m128i
579 combine4 (const __m128i *ps, const __m128i *pm)
581 __m128i xmm_src_lo, xmm_src_hi;
582 __m128i xmm_msk_lo, xmm_msk_hi;
587 xmm_msk_lo = load_128_unaligned (pm);
589 if (is_transparent (xmm_msk_lo))
590 return _mm_setzero_si128 ();
593 s = load_128_unaligned (ps);
597 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
598 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
600 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
602 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
603 &xmm_msk_lo, &xmm_msk_hi,
604 &xmm_src_lo, &xmm_src_hi);
606 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
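/* combine1 () and combine4 () fetch one and four source pixels
 * respectively; when a mask is supplied they multiply the source by the
 * expanded mask alpha, and combine4 () short-circuits to zero when all
 * four mask pixels are fully transparent.
 */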
612 static force_inline void
613 core_combine_over_u_sse2 (uint32_t* pd,
620 __m128i xmm_dst_lo, xmm_dst_hi;
621 __m128i xmm_src_lo, xmm_src_hi;
622 __m128i xmm_alpha_lo, xmm_alpha_hi;
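/* This combiner, like the ones that follow, runs in three phases: a
 * scalar head loop until pd reaches 16-byte alignment, a main loop that
 * processes four pixels per iteration with aligned stores, and a scalar
 * tail for the remaining pixels.
 */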
624 /* Align dst on a 16-byte boundary */
625 while (w && ((unsigned long)pd & 15))
628 s = combine1 (ps, pm);
630 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
639 /* I'm loading unaligned because I'm not sure about
640 * the address alignment.
642 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
644 if (is_opaque (xmm_src_hi))
646 save_128_aligned ((__m128i*)pd, xmm_src_hi);
648 else if (!is_zero (xmm_src_hi))
650 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
652 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
653 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
656 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
658 over_2x128 (&xmm_src_lo, &xmm_src_hi,
659 &xmm_alpha_lo, &xmm_alpha_hi,
660 &xmm_dst_lo, &xmm_dst_hi);
662 /* rebuild the 4 pixel data and save */
663 save_128_aligned ((__m128i*)pd,
664 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
677 s = combine1 (ps, pm);
679 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
688 static force_inline void
689 core_combine_over_reverse_u_sse2 (uint32_t* pd,
696 __m128i xmm_dst_lo, xmm_dst_hi;
697 __m128i xmm_src_lo, xmm_src_hi;
698 __m128i xmm_alpha_lo, xmm_alpha_hi;
700 /* Align dst on a 16-byte boundary */
702 ((unsigned long)pd & 15))
705 s = combine1 (ps, pm);
707 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
716 /* I'm loading unaligned because I'm not sure
717 * about the address alignment.
719 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
720 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
722 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
723 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
725 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
726 &xmm_alpha_lo, &xmm_alpha_hi);
728 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
729 &xmm_alpha_lo, &xmm_alpha_hi,
730 &xmm_src_lo, &xmm_src_hi);
732 /* rebuild the 4 pixel data and save */
733 save_128_aligned ((__m128i*)pd,
734 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
747 s = combine1 (ps, pm);
749 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
757 static force_inline uint32_t
758 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
760 uint32_t maska = src >> 24;
766 else if (maska != 0xff)
768 return pack_1x64_32 (
769 pix_multiply_1x64 (unpack_32_1x64 (dst),
770 expand_alpha_1x64 (unpack_32_1x64 (src))));
776 static force_inline void
777 core_combine_in_u_sse2 (uint32_t* pd,
784 __m128i xmm_src_lo, xmm_src_hi;
785 __m128i xmm_dst_lo, xmm_dst_hi;
787 while (w && ((unsigned long) pd & 15))
789 s = combine1 (ps, pm);
792 *pd++ = core_combine_in_u_pixelsse2 (d, s);
801 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
802 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
804 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
805 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
807 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
808 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
809 &xmm_dst_lo, &xmm_dst_hi,
810 &xmm_dst_lo, &xmm_dst_hi);
812 save_128_aligned ((__m128i*)pd,
813 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
824 s = combine1 (ps, pm);
827 *pd++ = core_combine_in_u_pixelsse2 (d, s);
835 static force_inline void
836 core_combine_reverse_in_u_sse2 (uint32_t* pd,
843 __m128i xmm_src_lo, xmm_src_hi;
844 __m128i xmm_dst_lo, xmm_dst_hi;
846 while (w && ((unsigned long) pd & 15))
848 s = combine1 (ps, pm);
851 *pd++ = core_combine_in_u_pixelsse2 (s, d);
860 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
861 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
863 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
864 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
866 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
867 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
868 &xmm_src_lo, &xmm_src_hi,
869 &xmm_dst_lo, &xmm_dst_hi);
872 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
883 s = combine1 (ps, pm);
886 *pd++ = core_combine_in_u_pixelsse2 (s, d);
894 static force_inline void
895 core_combine_reverse_out_u_sse2 (uint32_t* pd,
900 while (w && ((unsigned long) pd & 15))
902 uint32_t s = combine1 (ps, pm);
905 *pd++ = pack_1x64_32 (
907 unpack_32_1x64 (d), negate_1x64 (
908 expand_alpha_1x64 (unpack_32_1x64 (s)))));
918 __m128i xmm_src_lo, xmm_src_hi;
919 __m128i xmm_dst_lo, xmm_dst_hi;
921 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
922 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
924 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
925 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
927 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
928 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
930 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
931 &xmm_src_lo, &xmm_src_hi,
932 &xmm_dst_lo, &xmm_dst_hi);
935 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
947 uint32_t s = combine1 (ps, pm);
950 *pd++ = pack_1x64_32 (
952 unpack_32_1x64 (d), negate_1x64 (
953 expand_alpha_1x64 (unpack_32_1x64 (s)))));
961 static force_inline void
962 core_combine_out_u_sse2 (uint32_t* pd,
967 while (w && ((unsigned long) pd & 15))
969 uint32_t s = combine1 (ps, pm);
972 *pd++ = pack_1x64_32 (
974 unpack_32_1x64 (s), negate_1x64 (
975 expand_alpha_1x64 (unpack_32_1x64 (d)))));
984 __m128i xmm_src_lo, xmm_src_hi;
985 __m128i xmm_dst_lo, xmm_dst_hi;
987 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
988 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
990 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
991 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
993 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
994 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
996 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
997 &xmm_dst_lo, &xmm_dst_hi,
998 &xmm_dst_lo, &xmm_dst_hi);
1001 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1012 uint32_t s = combine1 (ps, pm);
1015 *pd++ = pack_1x64_32 (
1017 unpack_32_1x64 (s), negate_1x64 (
1018 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1026 static force_inline uint32_t
1027 core_combine_atop_u_pixel_sse2 (uint32_t src,
1030 __m64 s = unpack_32_1x64 (src);
1031 __m64 d = unpack_32_1x64 (dst);
1033 __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1034 __m64 da = expand_alpha_1x64 (d);
1036 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
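/* The helper above implements ATOP:
 *
 *     result = src * alpha (dst) + dst * (1 - alpha (src))
 */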
1039 static force_inline void
1040 core_combine_atop_u_sse2 (uint32_t* pd,
1047 __m128i xmm_src_lo, xmm_src_hi;
1048 __m128i xmm_dst_lo, xmm_dst_hi;
1049 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1050 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1052 while (w && ((unsigned long) pd & 15))
1054 s = combine1 (ps, pm);
1057 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1066 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1067 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1069 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1070 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1072 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1073 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1074 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1075 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1077 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1078 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1080 pix_add_multiply_2x128 (
1081 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1082 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1083 &xmm_dst_lo, &xmm_dst_hi);
1086 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1097 s = combine1 (ps, pm);
1100 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1108 static force_inline uint32_t
1109 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1112 __m64 s = unpack_32_1x64 (src);
1113 __m64 d = unpack_32_1x64 (dst);
1115 __m64 sa = expand_alpha_1x64 (s);
1116 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1118 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
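/* The helper above implements ATOP_REVERSE:
 *
 *     result = src * (1 - alpha (dst)) + dst * alpha (src)
 */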
1121 static force_inline void
1122 core_combine_reverse_atop_u_sse2 (uint32_t* pd,
1129 __m128i xmm_src_lo, xmm_src_hi;
1130 __m128i xmm_dst_lo, xmm_dst_hi;
1131 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1132 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1134 while (w && ((unsigned long) pd & 15))
1136 s = combine1 (ps, pm);
1139 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1148 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1149 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1151 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1152 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1154 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1155 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1156 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1157 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1159 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1160 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1162 pix_add_multiply_2x128 (
1163 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1164 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1165 &xmm_dst_lo, &xmm_dst_hi);
1168 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1179 s = combine1 (ps, pm);
1182 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1190 static force_inline uint32_t
1191 core_combine_xor_u_pixel_sse2 (uint32_t src,
1194 __m64 s = unpack_32_1x64 (src);
1195 __m64 d = unpack_32_1x64 (dst);
1197 __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1198 __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1200 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
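/* The helper above implements XOR:
 *
 *     result = src * (1 - alpha (dst)) + dst * (1 - alpha (src))
 */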
1203 static force_inline void
1204 core_combine_xor_u_sse2 (uint32_t* dst,
1205 const uint32_t* src,
1206 const uint32_t *mask,
1212 const uint32_t* ps = src;
1213 const uint32_t* pm = mask;
1215 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1216 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1217 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1218 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1220 while (w && ((unsigned long) pd & 15))
1222 s = combine1 (ps, pm);
1225 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1234 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1235 xmm_dst = load_128_aligned ((__m128i*) pd);
1237 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1238 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1240 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1241 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1242 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1243 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1245 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1246 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1247 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1248 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1250 pix_add_multiply_2x128 (
1251 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1252 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1253 &xmm_dst_lo, &xmm_dst_hi);
1256 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1267 s = combine1 (ps, pm);
1270 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1278 static force_inline void
1279 core_combine_add_u_sse2 (uint32_t* dst,
1280 const uint32_t* src,
1281 const uint32_t* mask,
1287 const uint32_t* ps = src;
1288 const uint32_t* pm = mask;
1290 while (w && (unsigned long)pd & 15)
1292 s = combine1 (ps, pm);
1298 *pd++ = _mm_cvtsi64_si32 (
1299 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1307 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1310 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1321 s = combine1 (ps, pm);
1325 *pd++ = _mm_cvtsi64_si32 (
1326 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1332 static force_inline uint32_t
1333 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1336 __m64 ms = unpack_32_1x64 (src);
1337 __m64 md = unpack_32_1x64 (dst);
1338 uint32_t sa = src >> 24;
1339 uint32_t da = ~dst >> 24;
1343 ms = pix_multiply_1x64 (
1344 ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1347 return pack_1x64_32 (_mm_adds_pu16 (md, ms));
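/* The helper above implements SATURATE: when the source alpha sa exceeds
 * the headroom left in the destination (da = ~alpha (dst)), the source is
 * first scaled by DIV_UN8 (da, sa) so that the following saturating add
 * cannot push the destination alpha past 0xff.
 */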
1350 static force_inline void
1351 core_combine_saturate_u_sse2 (uint32_t * pd,
1359 __m128i xmm_src, xmm_dst;
1361 while (w && (unsigned long)pd & 15)
1363 s = combine1 (ps, pm);
1366 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1375 xmm_dst = load_128_aligned ((__m128i*)pd);
1376 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1378 pack_cmp = _mm_movemask_epi8 (
1380 _mm_srli_epi32 (xmm_src, 24),
1381 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1383 /* if any source alpha is greater than the corresponding ~dst alpha */
1386 s = combine1 (ps++, pm);
1388 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1392 s = combine1 (ps++, pm);
1394 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1398 s = combine1 (ps++, pm);
1400 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1404 s = combine1 (ps++, pm);
1406 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1412 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1425 s = combine1 (ps, pm);
1428 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1435 static force_inline void
1436 core_combine_src_ca_sse2 (uint32_t* pd,
1443 __m128i xmm_src_lo, xmm_src_hi;
1444 __m128i xmm_mask_lo, xmm_mask_hi;
1445 __m128i xmm_dst_lo, xmm_dst_hi;
1447 while (w && (unsigned long)pd & 15)
1451 *pd++ = pack_1x64_32 (
1452 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1458 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1459 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1461 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1462 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1464 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1465 &xmm_mask_lo, &xmm_mask_hi,
1466 &xmm_dst_lo, &xmm_dst_hi);
1469 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1481 *pd++ = pack_1x64_32 (
1482 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1487 static force_inline uint32_t
1488 core_combine_over_ca_pixel_sse2 (uint32_t src,
1492 __m64 s = unpack_32_1x64 (src);
1493 __m64 expAlpha = expand_alpha_1x64 (s);
1494 __m64 unpk_mask = unpack_32_1x64 (mask);
1495 __m64 unpk_dst = unpack_32_1x64 (dst);
1497 return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1500 static force_inline void
1501 core_combine_over_ca_sse2 (uint32_t* pd,
1508 __m128i xmm_alpha_lo, xmm_alpha_hi;
1509 __m128i xmm_src_lo, xmm_src_hi;
1510 __m128i xmm_dst_lo, xmm_dst_hi;
1511 __m128i xmm_mask_lo, xmm_mask_hi;
1513 while (w && (unsigned long)pd & 15)
1519 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1525 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1526 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1527 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1529 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1530 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1531 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1533 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1534 &xmm_alpha_lo, &xmm_alpha_hi);
1536 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1537 &xmm_alpha_lo, &xmm_alpha_hi,
1538 &xmm_mask_lo, &xmm_mask_hi,
1539 &xmm_dst_lo, &xmm_dst_hi);
1542 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1556 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1561 static force_inline uint32_t
1562 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1566 __m64 d = unpack_32_1x64 (dst);
1568 return pack_1x64_32 (
1569 over_1x64 (d, expand_alpha_1x64 (d),
1570 pix_multiply_1x64 (unpack_32_1x64 (src),
1571 unpack_32_1x64 (mask))));
1574 static force_inline void
1575 core_combine_over_reverse_ca_sse2 (uint32_t* pd,
1582 __m128i xmm_alpha_lo, xmm_alpha_hi;
1583 __m128i xmm_src_lo, xmm_src_hi;
1584 __m128i xmm_dst_lo, xmm_dst_hi;
1585 __m128i xmm_mask_lo, xmm_mask_hi;
1587 while (w && (unsigned long)pd & 15)
1593 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1599 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1600 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1601 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1603 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1604 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1605 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1607 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1608 &xmm_alpha_lo, &xmm_alpha_hi);
1609 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1610 &xmm_mask_lo, &xmm_mask_hi,
1611 &xmm_mask_lo, &xmm_mask_hi);
1613 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1614 &xmm_alpha_lo, &xmm_alpha_hi,
1615 &xmm_mask_lo, &xmm_mask_hi);
1618 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1632 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1637 static force_inline void
1638 core_combine_in_ca_sse2 (uint32_t * pd,
1645 __m128i xmm_alpha_lo, xmm_alpha_hi;
1646 __m128i xmm_src_lo, xmm_src_hi;
1647 __m128i xmm_dst_lo, xmm_dst_hi;
1648 __m128i xmm_mask_lo, xmm_mask_hi;
1650 while (w && (unsigned long)pd & 15)
1656 *pd++ = pack_1x64_32 (
1658 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1659 expand_alpha_1x64 (unpack_32_1x64 (d))));
1666 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1667 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1668 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1670 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1671 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1672 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1674 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1675 &xmm_alpha_lo, &xmm_alpha_hi);
1677 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1678 &xmm_mask_lo, &xmm_mask_hi,
1679 &xmm_dst_lo, &xmm_dst_hi);
1681 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1682 &xmm_alpha_lo, &xmm_alpha_hi,
1683 &xmm_dst_lo, &xmm_dst_hi);
1686 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1700 *pd++ = pack_1x64_32 (
1703 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1704 expand_alpha_1x64 (unpack_32_1x64 (d))));
1710 static force_inline void
1711 core_combine_in_reverse_ca_sse2 (uint32_t * pd,
1718 __m128i xmm_alpha_lo, xmm_alpha_hi;
1719 __m128i xmm_src_lo, xmm_src_hi;
1720 __m128i xmm_dst_lo, xmm_dst_hi;
1721 __m128i xmm_mask_lo, xmm_mask_hi;
1723 while (w && (unsigned long)pd & 15)
1729 *pd++ = pack_1x64_32 (
1732 pix_multiply_1x64 (unpack_32_1x64 (m),
1733 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1739 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1740 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1741 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1743 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1744 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1745 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1747 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1748 &xmm_alpha_lo, &xmm_alpha_hi);
1749 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1750 &xmm_alpha_lo, &xmm_alpha_hi,
1751 &xmm_alpha_lo, &xmm_alpha_hi);
1753 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1754 &xmm_alpha_lo, &xmm_alpha_hi,
1755 &xmm_dst_lo, &xmm_dst_hi);
1758 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1772 *pd++ = pack_1x64_32 (
1775 pix_multiply_1x64 (unpack_32_1x64 (m),
1776 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1781 static force_inline void
1782 core_combine_out_ca_sse2 (uint32_t * pd,
1789 __m128i xmm_alpha_lo, xmm_alpha_hi;
1790 __m128i xmm_src_lo, xmm_src_hi;
1791 __m128i xmm_dst_lo, xmm_dst_hi;
1792 __m128i xmm_mask_lo, xmm_mask_hi;
1794 while (w && (unsigned long)pd & 15)
1800 *pd++ = pack_1x64_32 (
1803 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1804 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
1810 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1811 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1812 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1814 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1815 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1816 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1818 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1819 &xmm_alpha_lo, &xmm_alpha_hi);
1820 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1821 &xmm_alpha_lo, &xmm_alpha_hi);
1823 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1824 &xmm_mask_lo, &xmm_mask_hi,
1825 &xmm_dst_lo, &xmm_dst_hi);
1826 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1827 &xmm_alpha_lo, &xmm_alpha_hi,
1828 &xmm_dst_lo, &xmm_dst_hi);
1831 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1845 *pd++ = pack_1x64_32 (
1848 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1849 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
1855 static force_inline void
1856 core_combine_out_reverse_ca_sse2 (uint32_t * pd,
1863 __m128i xmm_alpha_lo, xmm_alpha_hi;
1864 __m128i xmm_src_lo, xmm_src_hi;
1865 __m128i xmm_dst_lo, xmm_dst_hi;
1866 __m128i xmm_mask_lo, xmm_mask_hi;
1868 while (w && (unsigned long)pd & 15)
1874 *pd++ = pack_1x64_32 (
1877 negate_1x64 (pix_multiply_1x64 (
1879 expand_alpha_1x64 (unpack_32_1x64 (s))))));
1885 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1886 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1887 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1889 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1890 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1891 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1893 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1894 &xmm_alpha_lo, &xmm_alpha_hi);
1896 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1897 &xmm_alpha_lo, &xmm_alpha_hi,
1898 &xmm_mask_lo, &xmm_mask_hi);
1900 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1901 &xmm_mask_lo, &xmm_mask_hi);
1903 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1904 &xmm_mask_lo, &xmm_mask_hi,
1905 &xmm_dst_lo, &xmm_dst_hi);
1908 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1922 *pd++ = pack_1x64_32 (
1925 negate_1x64 (pix_multiply_1x64 (
1927 expand_alpha_1x64 (unpack_32_1x64 (s))))));
1932 static force_inline uint32_t
1933 core_combine_atop_ca_pixel_sse2 (uint32_t src,
1937 __m64 m = unpack_32_1x64 (mask);
1938 __m64 s = unpack_32_1x64 (src);
1939 __m64 d = unpack_32_1x64 (dst);
1940 __m64 sa = expand_alpha_1x64 (s);
1941 __m64 da = expand_alpha_1x64 (d);
1943 s = pix_multiply_1x64 (s, m);
1944 m = negate_1x64 (pix_multiply_1x64 (m, sa));
1946 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
1949 static force_inline void
1950 core_combine_atop_ca_sse2 (uint32_t * pd,
1957 __m128i xmm_src_lo, xmm_src_hi;
1958 __m128i xmm_dst_lo, xmm_dst_hi;
1959 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1960 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1961 __m128i xmm_mask_lo, xmm_mask_hi;
1963 while (w && (unsigned long)pd & 15)
1969 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
1975 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1976 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1977 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1979 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1980 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1981 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1983 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1984 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1985 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1986 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1988 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1989 &xmm_mask_lo, &xmm_mask_hi,
1990 &xmm_src_lo, &xmm_src_hi);
1991 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1992 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1993 &xmm_mask_lo, &xmm_mask_hi);
1995 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1997 pix_add_multiply_2x128 (
1998 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
1999 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2000 &xmm_dst_lo, &xmm_dst_hi);
2003 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2017 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2022 static force_inline uint32_t
2023 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2027 __m64 m = unpack_32_1x64 (mask);
2028 __m64 s = unpack_32_1x64 (src);
2029 __m64 d = unpack_32_1x64 (dst);
2031 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2032 __m64 sa = expand_alpha_1x64 (s);
2034 s = pix_multiply_1x64 (s, m);
2035 m = pix_multiply_1x64 (m, sa);
2037 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2040 static force_inline void
2041 core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
2048 __m128i xmm_src_lo, xmm_src_hi;
2049 __m128i xmm_dst_lo, xmm_dst_hi;
2050 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2051 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2052 __m128i xmm_mask_lo, xmm_mask_hi;
2054 while (w && (unsigned long)pd & 15)
2060 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2066 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2067 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2068 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2071 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2072 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2075 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2076 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2077 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2079 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2080 &xmm_mask_lo, &xmm_mask_hi,
2081 &xmm_src_lo, &xmm_src_hi);
2082 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2083 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2084 &xmm_mask_lo, &xmm_mask_hi);
2086 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2087 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2089 pix_add_multiply_2x128 (
2090 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092 &xmm_dst_lo, &xmm_dst_hi);
2095 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2109 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2114 static force_inline uint32_t
2115 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2119 __m64 a = unpack_32_1x64 (mask);
2120 __m64 s = unpack_32_1x64 (src);
2121 __m64 d = unpack_32_1x64 (dst);
2123 __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2124 a, expand_alpha_1x64 (s)));
2125 __m64 dest = pix_multiply_1x64 (s, a);
2126 __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2128 return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2134 static force_inline void
2135 core_combine_xor_ca_sse2 (uint32_t * pd,
2142 __m128i xmm_src_lo, xmm_src_hi;
2143 __m128i xmm_dst_lo, xmm_dst_hi;
2144 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2145 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2146 __m128i xmm_mask_lo, xmm_mask_hi;
2148 while (w && (unsigned long)pd & 15)
2154 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2160 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2164 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2168 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2170 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2171 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2173 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2174 &xmm_mask_lo, &xmm_mask_hi,
2175 &xmm_src_lo, &xmm_src_hi);
2176 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2177 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2178 &xmm_mask_lo, &xmm_mask_hi);
2180 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2181 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2183 &xmm_mask_lo, &xmm_mask_hi);
2185 pix_add_multiply_2x128 (
2186 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2187 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2188 &xmm_dst_lo, &xmm_dst_hi);
2191 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2205 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2210 static force_inline void
2211 core_combine_add_ca_sse2 (uint32_t * pd,
2218 __m128i xmm_src_lo, xmm_src_hi;
2219 __m128i xmm_dst_lo, xmm_dst_hi;
2220 __m128i xmm_mask_lo, xmm_mask_hi;
2222 while (w && (unsigned long)pd & 15)
2228 *pd++ = pack_1x64_32 (
2229 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2230 unpack_32_1x64 (m)),
2231 unpack_32_1x64 (d)));
2237 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2238 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2239 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2241 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2242 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2243 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2245 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2246 &xmm_mask_lo, &xmm_mask_hi,
2247 &xmm_src_lo, &xmm_src_hi);
2250 (__m128i*)pd, pack_2x128_128 (
2251 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2252 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2266 *pd++ = pack_1x64_32 (
2267 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2268 unpack_32_1x64 (m)),
2269 unpack_32_1x64 (d)));
2274 /* ---------------------------------------------------
2275 * fb_compose_setup_SSE2
2277 static force_inline __m64
2278 create_mask_16_64 (uint16_t mask)
2280 return _mm_set1_pi16 (mask);
2283 static force_inline __m128i
2284 create_mask_16_128 (uint16_t mask)
2286 return _mm_set1_epi16 (mask);
2289 static force_inline __m64
2290 create_mask_2x32_64 (uint32_t mask0,
2293 return _mm_set_pi32 (mask0, mask1);
2296 /* Work around a code generation bug in Sun Studio 12. */
2297 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2298 # define create_mask_2x32_128(mask0, mask1) \
2299 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2301 static force_inline __m128i
2302 create_mask_2x32_128 (uint32_t mask0,
2305 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2309 /* SSE2 code patch for fbcompose.c */
2312 sse2_combine_over_u (pixman_implementation_t *imp,
2315 const uint32_t * src,
2316 const uint32_t * mask,
2319 core_combine_over_u_sse2 (dst, src, mask, width);
2324 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2327 const uint32_t * src,
2328 const uint32_t * mask,
2331 core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2336 sse2_combine_in_u (pixman_implementation_t *imp,
2339 const uint32_t * src,
2340 const uint32_t * mask,
2343 core_combine_in_u_sse2 (dst, src, mask, width);
2348 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2351 const uint32_t * src,
2352 const uint32_t * mask,
2355 core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2360 sse2_combine_out_u (pixman_implementation_t *imp,
2363 const uint32_t * src,
2364 const uint32_t * mask,
2367 core_combine_out_u_sse2 (dst, src, mask, width);
2372 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2375 const uint32_t * src,
2376 const uint32_t * mask,
2379 core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2384 sse2_combine_atop_u (pixman_implementation_t *imp,
2387 const uint32_t * src,
2388 const uint32_t * mask,
2391 core_combine_atop_u_sse2 (dst, src, mask, width);
2396 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2399 const uint32_t * src,
2400 const uint32_t * mask,
2403 core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2408 sse2_combine_xor_u (pixman_implementation_t *imp,
2411 const uint32_t * src,
2412 const uint32_t * mask,
2415 core_combine_xor_u_sse2 (dst, src, mask, width);
2420 sse2_combine_add_u (pixman_implementation_t *imp,
2423 const uint32_t * src,
2424 const uint32_t * mask,
2427 core_combine_add_u_sse2 (dst, src, mask, width);
2432 sse2_combine_saturate_u (pixman_implementation_t *imp,
2435 const uint32_t * src,
2436 const uint32_t * mask,
2439 core_combine_saturate_u_sse2 (dst, src, mask, width);
2444 sse2_combine_src_ca (pixman_implementation_t *imp,
2447 const uint32_t * src,
2448 const uint32_t * mask,
2451 core_combine_src_ca_sse2 (dst, src, mask, width);
2456 sse2_combine_over_ca (pixman_implementation_t *imp,
2459 const uint32_t * src,
2460 const uint32_t * mask,
2463 core_combine_over_ca_sse2 (dst, src, mask, width);
2468 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2471 const uint32_t * src,
2472 const uint32_t * mask,
2475 core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2480 sse2_combine_in_ca (pixman_implementation_t *imp,
2483 const uint32_t * src,
2484 const uint32_t * mask,
2487 core_combine_in_ca_sse2 (dst, src, mask, width);
2492 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2495 const uint32_t * src,
2496 const uint32_t * mask,
2499 core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2504 sse2_combine_out_ca (pixman_implementation_t *imp,
2507 const uint32_t * src,
2508 const uint32_t * mask,
2511 core_combine_out_ca_sse2 (dst, src, mask, width);
2516 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2519 const uint32_t * src,
2520 const uint32_t * mask,
2523 core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2528 sse2_combine_atop_ca (pixman_implementation_t *imp,
2531 const uint32_t * src,
2532 const uint32_t * mask,
2535 core_combine_atop_ca_sse2 (dst, src, mask, width);
2540 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2543 const uint32_t * src,
2544 const uint32_t * mask,
2547 core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2552 sse2_combine_xor_ca (pixman_implementation_t *imp,
2555 const uint32_t * src,
2556 const uint32_t * mask,
2559 core_combine_xor_ca_sse2 (dst, src, mask, width);
2564 sse2_combine_add_ca (pixman_implementation_t *imp,
2567 const uint32_t * src,
2568 const uint32_t * mask,
2571 core_combine_add_ca_sse2 (dst, src, mask, width);
2575 /* -------------------------------------------------------------------
2576 * composite_over_n_8888
2580 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2582 pixman_image_t * src_image,
2583 pixman_image_t * mask_image,
2584 pixman_image_t * dst_image,
2595 uint32_t *dst_line, *dst, d;
2598 __m128i xmm_src, xmm_alpha;
2599 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2601 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2606 PIXMAN_IMAGE_GET_LINE (
2607 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2609 xmm_src = expand_pixel_32_1x128 (src);
2610 xmm_alpha = expand_alpha_1x128 (xmm_src);
2616 dst_line += dst_stride;
2619 while (w && (unsigned long)dst & 15)
2622 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2623 _mm_movepi64_pi64 (xmm_alpha),
2624 unpack_32_1x64 (d)));
2630 xmm_dst = load_128_aligned ((__m128i*)dst);
2632 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2634 over_2x128 (&xmm_src, &xmm_src,
2635 &xmm_alpha, &xmm_alpha,
2636 &xmm_dst_lo, &xmm_dst_hi);
2638 /* rebuild the 4 pixel data and save */
2640 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2649 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2650 _mm_movepi64_pi64 (xmm_alpha),
2651 unpack_32_1x64 (d)));
2659 /* ---------------------------------------------------------------------
2660 * composite_over_n_0565
2663 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2665 pixman_image_t * src_image,
2666 pixman_image_t * mask_image,
2667 pixman_image_t * dst_image,
2678 uint16_t *dst_line, *dst, d;
2681 __m128i xmm_src, xmm_alpha;
2682 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2684 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2689 PIXMAN_IMAGE_GET_LINE (
2690 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2692 xmm_src = expand_pixel_32_1x128 (src);
2693 xmm_alpha = expand_alpha_1x128 (xmm_src);
2699 dst_line += dst_stride;
2702 while (w && (unsigned long)dst & 15)
2706 *dst++ = pack_565_32_16 (
2707 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2708 _mm_movepi64_pi64 (xmm_alpha),
2709 expand565_16_1x64 (d))));
2715 xmm_dst = load_128_aligned ((__m128i*)dst);
2717 unpack_565_128_4x128 (xmm_dst,
2718 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2720 over_2x128 (&xmm_src, &xmm_src,
2721 &xmm_alpha, &xmm_alpha,
2722 &xmm_dst0, &xmm_dst1);
2723 over_2x128 (&xmm_src, &xmm_src,
2724 &xmm_alpha, &xmm_alpha,
2725 &xmm_dst2, &xmm_dst3);
2727 xmm_dst = pack_565_4x128_128 (
2728 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2730 save_128_aligned ((__m128i*)dst, xmm_dst);
2739 *dst++ = pack_565_32_16 (
2740 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2741 _mm_movepi64_pi64 (xmm_alpha),
2742 expand565_16_1x64 (d))));
2749 /* ------------------------------
2750 * composite_add_n_8888_8888_ca
2753 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2755 pixman_image_t * src_image,
2756 pixman_image_t * mask_image,
2757 pixman_image_t * dst_image,
2768 uint32_t *dst_line, d;
2769 uint32_t *mask_line, m;
2771 int dst_stride, mask_stride;
2773 __m128i xmm_src, xmm_alpha;
2775 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2777 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2779 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2785 PIXMAN_IMAGE_GET_LINE (
2786 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2787 PIXMAN_IMAGE_GET_LINE (
2788 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2790 xmm_src = _mm_unpacklo_epi8 (
2791 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2792 xmm_alpha = expand_alpha_1x128 (xmm_src);
2793 mmx_src = _mm_movepi64_pi64 (xmm_src);
2794 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2799 const uint32_t *pm = (uint32_t *)mask_line;
2800 uint32_t *pd = (uint32_t *)dst_line;
2802 dst_line += dst_stride;
2803 mask_line += mask_stride;
2805 while (w && (unsigned long)pd & 15)
2813 mmx_mask = unpack_32_1x64 (m);
2814 mmx_dest = unpack_32_1x64 (d);
2816 *pd = pack_1x64_32 (
2817 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2826 xmm_mask = load_128_unaligned ((__m128i*)pm);
2830 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2832 /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
2833 if (pack_cmp != 0xffff)
2835 xmm_dst = load_128_aligned ((__m128i*)pd);
2837 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2839 pix_multiply_2x128 (&xmm_src, &xmm_src,
2840 &xmm_mask_lo, &xmm_mask_hi,
2841 &xmm_mask_lo, &xmm_mask_hi);
2842 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2845 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2861 mmx_mask = unpack_32_1x64 (m);
2862 mmx_dest = unpack_32_1x64 (d);
2864 *pd = pack_1x64_32 (
2865 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
2876 /* ---------------------------------------------------------------------------
2877 * composite_over_n_8888_8888_ca
2881 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2883 pixman_image_t * src_image,
2884 pixman_image_t * mask_image,
2885 pixman_image_t * dst_image,
2896 uint32_t *dst_line, d;
2897 uint32_t *mask_line, m;
2899 int dst_stride, mask_stride;
2901 __m128i xmm_src, xmm_alpha;
2902 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2903 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2905 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2907 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2912 PIXMAN_IMAGE_GET_LINE (
2913 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2914 PIXMAN_IMAGE_GET_LINE (
2915 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2917 xmm_src = _mm_unpacklo_epi8 (
2918 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2919 xmm_alpha = expand_alpha_1x128 (xmm_src);
2920 mmx_src = _mm_movepi64_pi64 (xmm_src);
2921 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2926 const uint32_t *pm = (uint32_t *)mask_line;
2927 uint32_t *pd = (uint32_t *)dst_line;
2929 dst_line += dst_stride;
2930 mask_line += mask_stride;
2932 while (w && (unsigned long)pd & 15)
2939 mmx_mask = unpack_32_1x64 (m);
2940 mmx_dest = unpack_32_1x64 (d);
2942 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
2954 xmm_mask = load_128_unaligned ((__m128i*)pm);
2958 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2960 /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
2961 if (pack_cmp != 0xffff)
2963 xmm_dst = load_128_aligned ((__m128i*)pd);
2965 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2966 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2968 in_over_2x128 (&xmm_src, &xmm_src,
2969 &xmm_alpha, &xmm_alpha,
2970 &xmm_mask_lo, &xmm_mask_hi,
2971 &xmm_dst_lo, &xmm_dst_hi);
2974 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2989 mmx_mask = unpack_32_1x64 (m);
2990 mmx_dest = unpack_32_1x64 (d);
2992 *pd = pack_1x64_32 (
2993 in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3004 /*---------------------------------------------------------------------
3005 * composite_over_8888_n_8888
3009 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3011 pixman_image_t * src_image,
3012 pixman_image_t * mask_image,
3013 pixman_image_t * dst_image,
3023 uint32_t *dst_line, *dst;
3024 uint32_t *src_line, *src;
3027 int dst_stride, src_stride;
3030 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3031 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3032 __m128i xmm_alpha_lo, xmm_alpha_hi;
3034 PIXMAN_IMAGE_GET_LINE (
3035 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3036 PIXMAN_IMAGE_GET_LINE (
3037 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3039 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3041 xmm_mask = create_mask_16_128 (mask >> 24);
3046 dst_line += dst_stride;
3048 src_line += src_stride;
3051 while (w && (unsigned long)dst & 15)
3053 uint32_t s = *src++;
3059 __m64 ms = unpack_32_1x64 (s);
3060 __m64 alpha = expand_alpha_1x64 (ms);
3061 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3062 __m64 dest = unpack_32_1x64 (d);
3064 *dst = pack_1x64_32 (
3065 in_over_1x64 (&ms, &alpha, &mask, &dest));
3073 xmm_src = load_128_unaligned ((__m128i*)src);
3075 if (!is_zero (xmm_src))
3077 xmm_dst = load_128_aligned ((__m128i*)dst);
3079 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3080 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3081 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3082 &xmm_alpha_lo, &xmm_alpha_hi);
3084 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3085 &xmm_alpha_lo, &xmm_alpha_hi,
3086 &xmm_mask, &xmm_mask,
3087 &xmm_dst_lo, &xmm_dst_hi);
3090 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3100 uint32_t s = *src++;
3106 __m64 ms = unpack_32_1x64 (s);
3107 __m64 alpha = expand_alpha_1x64 (ms);
3108 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3109 __m64 dest = unpack_32_1x64 (d);
3111 *dst = pack_1x64_32 (
3112 in_over_1x64 (&ms, &alpha, &mask, &dest));
3123 /*---------------------------------------------------------------------
3124 * composite_src_x888_8888
3128 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
3130 pixman_image_t * src_image,
3131 pixman_image_t * mask_image,
3132 pixman_image_t * dst_image,
3142 uint32_t *dst_line, *dst;
3143 uint32_t *src_line, *src;
3145 int dst_stride, src_stride;
3148 PIXMAN_IMAGE_GET_LINE (
3149 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3150 PIXMAN_IMAGE_GET_LINE (
3151 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3156 dst_line += dst_stride;
3158 src_line += src_stride;
3161 while (w && (unsigned long)dst & 15)
3163 *dst++ = *src++ | 0xff000000;
3169 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
3171 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
3172 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
3173 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
3174 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
3176 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
3177 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
3178 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
3179 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
3188 *dst++ = *src++ | 0xff000000;
3196 /* ---------------------------------------------------------------------
3197 * composite_over_x888_n_8888
3200 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3202 pixman_image_t * src_image,
3203 pixman_image_t * mask_image,
3204 pixman_image_t * dst_image,
3214 uint32_t *dst_line, *dst;
3215 uint32_t *src_line, *src;
3217 int dst_stride, src_stride;
3220 __m128i xmm_mask, xmm_alpha;
3221 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3222 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3224 PIXMAN_IMAGE_GET_LINE (
3225 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3226 PIXMAN_IMAGE_GET_LINE (
3227 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3229 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3231 xmm_mask = create_mask_16_128 (mask >> 24);
3232 xmm_alpha = mask_00ff;
3237 dst_line += dst_stride;
3239 src_line += src_stride;
3242 while (w && (unsigned long)dst & 15)
3244 uint32_t s = (*src++) | 0xff000000;
3247 __m64 src = unpack_32_1x64 (s);
3248 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3249 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3250 __m64 dest = unpack_32_1x64 (d);
3252 *dst++ = pack_1x64_32 (
3253 in_over_1x64 (&src, &alpha, &mask, &dest));
3260 xmm_src = _mm_or_si128 (
3261 load_128_unaligned ((__m128i*)src), mask_ff000000);
3262 xmm_dst = load_128_aligned ((__m128i*)dst);
3264 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3265 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3267 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3268 &xmm_alpha, &xmm_alpha,
3269 &xmm_mask, &xmm_mask,
3270 &xmm_dst_lo, &xmm_dst_hi);
3273 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3283 uint32_t s = (*src++) | 0xff000000;
3286 __m64 src = unpack_32_1x64 (s);
3287 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3288 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3289 __m64 dest = unpack_32_1x64 (d);
3291 *dst++ = pack_1x64_32 (
3292 in_over_1x64 (&src, &alpha, &mask, &dest));
3301 /* --------------------------------------------------------------------
3302 * composite_over_8888_8888
3305 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3307 pixman_image_t * src_image,
3308 pixman_image_t * mask_image,
3309 pixman_image_t * dst_image,
3319 int dst_stride, src_stride;
3320 uint32_t *dst_line, *dst;
3321 uint32_t *src_line, *src;
3323 PIXMAN_IMAGE_GET_LINE (
3324 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3325 PIXMAN_IMAGE_GET_LINE (
3326 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3333 core_combine_over_u_sse2 (dst, src, NULL, width);
3341 /* ------------------------------------------------------------------
3342 * composite_over_8888_0565
3344 static force_inline uint16_t
3345 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3349 ms = unpack_32_1x64 (src);
3350 return pack_565_32_16 (
3353 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3357 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3359 pixman_image_t * src_image,
3360 pixman_image_t * mask_image,
3361 pixman_image_t * dst_image,
3371 uint16_t *dst_line, *dst, d;
3372 uint32_t *src_line, *src, s;
3373 int dst_stride, src_stride;
3376 __m128i xmm_alpha_lo, xmm_alpha_hi;
3377 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3378 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3380 PIXMAN_IMAGE_GET_LINE (
3381 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3382 PIXMAN_IMAGE_GET_LINE (
3383 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3388 * This code was copied from the MMX implementation, keeping the FIXME.
3389 * If it's a problem there, it's probably a problem here too.
3391 assert (src_image->drawable == mask_image->drawable);
3399 dst_line += dst_stride;
3400 src_line += src_stride;
3403 /* Align dst on a 16-byte boundary */
3405 ((unsigned long)dst & 15))
3410 *dst++ = composite_over_8888_0565pixel (s, d);
3414 /* This is an 8-pixel loop */
3417 /* Load unaligned because the source address
3418 * is not guaranteed to be 16-byte aligned.
3420 xmm_src = load_128_unaligned ((__m128i*) src);
3421 xmm_dst = load_128_aligned ((__m128i*) dst);
3424 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3425 unpack_565_128_4x128 (xmm_dst,
3426 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3427 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3428 &xmm_alpha_lo, &xmm_alpha_hi);
3430 /* Load the next 4 source pixels early,
3431 * before they are needed, to optimize the memory read.
3433 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
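/* The load above is issued before the first four pixels are
 * composited, letting the CPU overlap the memory access with the
 * ALU work of over_2x128 below.
 */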
3435 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3436 &xmm_alpha_lo, &xmm_alpha_hi,
3437 &xmm_dst0, &xmm_dst1);
3440 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3441 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3442 &xmm_alpha_lo, &xmm_alpha_hi);
3444 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3445 &xmm_alpha_lo, &xmm_alpha_hi,
3446 &xmm_dst2, &xmm_dst3);
3449 (__m128i*)dst, pack_565_4x128_128 (
3450 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3462 *dst++ = composite_over_8888_0565pixel (s, d);
3469 /* -----------------------------------------------------------------
3470 * composite_over_n_8_8888
3474 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3476 pixman_image_t * src_image,
3477 pixman_image_t * mask_image,
3478 pixman_image_t * dst_image,
3489 uint32_t *dst_line, *dst;
3490 uint8_t *mask_line, *mask;
3491 int dst_stride, mask_stride;
3495 __m128i xmm_src, xmm_alpha, xmm_def;
3496 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3497 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3499 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3501 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3507 PIXMAN_IMAGE_GET_LINE (
3508 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3509 PIXMAN_IMAGE_GET_LINE (
3510 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3512 xmm_def = create_mask_2x32_128 (src, src);
3513 xmm_src = expand_pixel_32_1x128 (src);
3514 xmm_alpha = expand_alpha_1x128 (xmm_src);
3515 mmx_src = _mm_movepi64_pi64 (xmm_src);
3516 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3521 dst_line += dst_stride;
3523 mask_line += mask_stride;
3526 while (w && (unsigned long)dst & 15)
3528 uint8_t m = *mask++;
3533 mmx_mask = expand_pixel_8_1x64 (m);
3534 mmx_dest = unpack_32_1x64 (d);
3536 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3548 m = *((uint32_t*)mask);
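/* When the solid source is fully opaque and all four mask bytes are
 * 0xff, OVER degenerates to SRC, so the precomputed solid pixels in
 * xmm_def can be stored directly without reading the destination.
 */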
3550 if (srca == 0xff && m == 0xffffffff)
3552 save_128_aligned ((__m128i*)dst, xmm_def);
3556 xmm_dst = load_128_aligned ((__m128i*) dst);
3557 xmm_mask = unpack_32_1x128 (m);
3558 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3561 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3562 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3564 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3565 &xmm_mask_lo, &xmm_mask_hi);
3567 in_over_2x128 (&xmm_src, &xmm_src,
3568 &xmm_alpha, &xmm_alpha,
3569 &xmm_mask_lo, &xmm_mask_hi,
3570 &xmm_dst_lo, &xmm_dst_hi);
3573 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3583 uint8_t m = *mask++;
3588 mmx_mask = expand_pixel_8_1x64 (m);
3589 mmx_dest = unpack_32_1x64 (d);
3591 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3605 /* ----------------------------------------------------------------
3606 * pixman_fill_sse2
3610 pixman_fill_sse2 (uint32_t *bits,
3619 uint32_t byte_width;
3629 stride = stride * (int) sizeof (uint32_t) / 1;
3630 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3636 data = (w << 16) | w;
3640 stride = stride * (int) sizeof (uint32_t) / 2;
3641 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3642 byte_width = 2 * width;
3645 data = (data & 0xffff) * 0x00010001;
3649 stride = stride * (int) sizeof (uint32_t) / 4;
3650 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3651 byte_width = 4 * width;
3659 xmm_def = create_mask_2x32_128 (data, data);
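/* At this point 'data' holds the fill value replicated to 32 bits.
 * For example, an 8 bpp fill of 0xab becomes 0xabababab and a 16 bpp
 * fill of 0x1234 becomes 0x12341234; xmm_def then carries four copies
 * of that 32-bit word for the wide stores below.
 */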
3664 uint8_t *d = byte_line;
3665 byte_line += stride;
3668 while (w >= 1 && ((unsigned long)d & 1))
3670 *(uint8_t *)d = data;
3675 while (w >= 2 && ((unsigned long)d & 3))
3677 *(uint16_t *)d = data;
3682 while (w >= 4 && ((unsigned long)d & 15))
3684 *(uint32_t *)d = data;
3692 save_128_aligned ((__m128i*)(d), xmm_def);
3693 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3694 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3695 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3696 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3697 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3698 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3699 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3707 save_128_aligned ((__m128i*)(d), xmm_def);
3708 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3709 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3710 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3718 save_128_aligned ((__m128i*)(d), xmm_def);
3719 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3727 save_128_aligned ((__m128i*)(d), xmm_def);
3735 *(uint32_t *)d = data;
3743 *(uint16_t *)d = data;
3750 *(uint8_t *)d = data;
3761 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3763 pixman_image_t * src_image,
3764 pixman_image_t * mask_image,
3765 pixman_image_t * dst_image,
3776 uint32_t *dst_line, *dst;
3777 uint8_t *mask_line, *mask;
3778 int dst_stride, mask_stride;
3782 __m128i xmm_src, xmm_def;
3783 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3785 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3790 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3791 PIXMAN_FORMAT_BPP (dst_image->bits.format),
3792 dest_x, dest_y, width, height, 0);
3796 PIXMAN_IMAGE_GET_LINE (
3797 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3798 PIXMAN_IMAGE_GET_LINE (
3799 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3801 xmm_def = create_mask_2x32_128 (src, src);
3802 xmm_src = expand_pixel_32_1x128 (src);
3807 dst_line += dst_stride;
3809 mask_line += mask_stride;
3812 while (w && (unsigned long)dst & 15)
3814 uint8_t m = *mask++;
3818 *dst = pack_1x64_32 (
3820 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3833 m = *((uint32_t*)mask);
3835 if (srca == 0xff && m == 0xffffffff)
3837 save_128_aligned ((__m128i*)dst, xmm_def);
3841 xmm_mask = unpack_32_1x128 (m);
3842 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3845 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3847 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3848 &xmm_mask_lo, &xmm_mask_hi);
3850 pix_multiply_2x128 (&xmm_src, &xmm_src,
3851 &xmm_mask_lo, &xmm_mask_hi,
3852 &xmm_mask_lo, &xmm_mask_hi);
3855 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3859 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3869 uint8_t m = *mask++;
3873 *dst = pack_1x64_32 (
3875 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3890 /*-----------------------------------------------------------------------
3891 * composite_over_n_8_0565
3895 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3897 pixman_image_t * src_image,
3898 pixman_image_t * mask_image,
3899 pixman_image_t * dst_image,
3910 uint16_t *dst_line, *dst, d;
3911 uint8_t *mask_line, *mask;
3912 int dst_stride, mask_stride;
3915 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3917 __m128i xmm_src, xmm_alpha;
3918 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3919 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3921 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3927 PIXMAN_IMAGE_GET_LINE (
3928 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3929 PIXMAN_IMAGE_GET_LINE (
3930 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3932 xmm_src = expand_pixel_32_1x128 (src);
3933 xmm_alpha = expand_alpha_1x128 (xmm_src);
3934 mmx_src = _mm_movepi64_pi64 (xmm_src);
3935 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3940 dst_line += dst_stride;
3942 mask_line += mask_stride;
3945 while (w && (unsigned long)dst & 15)
3952 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
3953 mmx_dest = expand565_16_1x64 (d);
3955 *dst = pack_565_32_16 (
3958 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3967 xmm_dst = load_128_aligned ((__m128i*) dst);
3968 unpack_565_128_4x128 (xmm_dst,
3969 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3971 m = *((uint32_t*)mask);
3976 xmm_mask = unpack_32_1x128 (m);
3977 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3980 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3982 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3983 &xmm_mask_lo, &xmm_mask_hi);
3985 in_over_2x128 (&xmm_src, &xmm_src,
3986 &xmm_alpha, &xmm_alpha,
3987 &xmm_mask_lo, &xmm_mask_hi,
3988 &xmm_dst0, &xmm_dst1);
3991 m = *((uint32_t*)mask);
3996 xmm_mask = unpack_32_1x128 (m);
3997 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4000 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4002 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4003 &xmm_mask_lo, &xmm_mask_hi);
4004 in_over_2x128 (&xmm_src, &xmm_src,
4005 &xmm_alpha, &xmm_alpha,
4006 &xmm_mask_lo, &xmm_mask_hi,
4007 &xmm_dst2, &xmm_dst3);
4011 (__m128i*)dst, pack_565_4x128_128 (
4012 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4025 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4026 mmx_dest = expand565_16_1x64 (d);
4028 *dst = pack_565_32_16 (
4031 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4042 /* -----------------------------------------------------------------------
4043 * composite_over_pixbuf_0565
4047 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4049 pixman_image_t * src_image,
4050 pixman_image_t * mask_image,
4051 pixman_image_t * dst_image,
4061 uint16_t *dst_line, *dst, d;
4062 uint32_t *src_line, *src, s;
4063 int dst_stride, src_stride;
4065 uint32_t opaque, zero;
4068 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4069 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4071 PIXMAN_IMAGE_GET_LINE (
4072 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4073 PIXMAN_IMAGE_GET_LINE (
4074 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4079 * This code was copied from the MMX implementation, keeping the FIXME.
4080 * If it's a problem there, it's probably a problem here too.
4082 assert (src_image->drawable == mask_image->drawable);
4088 dst_line += dst_stride;
4090 src_line += src_stride;
4093 while (w && (unsigned long)dst & 15)
4098 ms = unpack_32_1x64 (s);
4100 *dst++ = pack_565_32_16 (
4102 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4109 xmm_src = load_128_unaligned ((__m128i*)src);
4110 xmm_dst = load_128_aligned ((__m128i*)dst);
4112 opaque = is_opaque (xmm_src);
4113 zero = is_zero (xmm_src);
4115 unpack_565_128_4x128 (xmm_dst,
4116 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4117 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4119 /* preload next round */
4120 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4124 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4125 &xmm_dst0, &xmm_dst1);
4129 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4130 &xmm_dst0, &xmm_dst1);
4134 opaque = is_opaque (xmm_src);
4135 zero = is_zero (xmm_src);
4137 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4141 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4142 &xmm_dst2, &xmm_dst3);
4146 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4147 &xmm_dst2, &xmm_dst3);
4151 (__m128i*)dst, pack_565_4x128_128 (
4152 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4164 ms = unpack_32_1x64 (s);
4166 *dst++ = pack_565_32_16 (
4168 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4176 /* -------------------------------------------------------------------------
4177 * composite_over_pixbuf_8888
4181 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4183 pixman_image_t * src_image,
4184 pixman_image_t * mask_image,
4185 pixman_image_t * dst_image,
4195 uint32_t *dst_line, *dst, d;
4196 uint32_t *src_line, *src, s;
4197 int dst_stride, src_stride;
4199 uint32_t opaque, zero;
4201 __m128i xmm_src_lo, xmm_src_hi;
4202 __m128i xmm_dst_lo, xmm_dst_hi;
4204 PIXMAN_IMAGE_GET_LINE (
4205 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4206 PIXMAN_IMAGE_GET_LINE (
4207 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4212 * This code was copied from the MMX implementation, keeping the FIXME.
4213 * If it's a problem there, it's probably a problem here too.
4215 assert (src_image->drawable == mask_image->drawable);
4221 dst_line += dst_stride;
4223 src_line += src_stride;
4226 while (w && (unsigned long)dst & 15)
4231 *dst++ = pack_1x64_32 (
4232 over_rev_non_pre_1x64 (
4233 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4240 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4242 opaque = is_opaque (xmm_src_hi);
4243 zero = is_zero (xmm_src_hi);
4245 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4249 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4250 &xmm_dst_lo, &xmm_dst_hi);
4253 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4257 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4259 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4261 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4262 &xmm_dst_lo, &xmm_dst_hi);
4265 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4278 *dst++ = pack_1x64_32 (
4279 over_rev_non_pre_1x64 (
4280 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4289 /* -------------------------------------------------------------------------------------------------
4290 * composite_over_n_8888_0565_ca
4294 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4296 pixman_image_t * src_image,
4297 pixman_image_t * mask_image,
4298 pixman_image_t * dst_image,
4309 uint16_t *dst_line, *dst, d;
4310 uint32_t *mask_line, *mask, m;
4311 int dst_stride, mask_stride;
4315 __m128i xmm_src, xmm_alpha;
4316 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4317 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4319 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4321 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4326 PIXMAN_IMAGE_GET_LINE (
4327 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4328 PIXMAN_IMAGE_GET_LINE (
4329 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4331 xmm_src = expand_pixel_32_1x128 (src);
4332 xmm_alpha = expand_alpha_1x128 (xmm_src);
4333 mmx_src = _mm_movepi64_pi64 (xmm_src);
4334 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4341 mask_line += mask_stride;
4342 dst_line += dst_stride;
4344 while (w && ((unsigned long)dst & 15))
4346 m = *(uint32_t *) mask;
4351 mmx_mask = unpack_32_1x64 (m);
4352 mmx_dest = expand565_16_1x64 (d);
4354 *dst = pack_565_32_16 (
4357 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4368 xmm_mask = load_128_unaligned ((__m128i*)mask);
4369 xmm_dst = load_128_aligned ((__m128i*)dst);
4371 pack_cmp = _mm_movemask_epi8 (
4372 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4374 unpack_565_128_4x128 (xmm_dst,
4375 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4376 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4378 /* preload next round */
4379 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4381 /* only process if at least one of the four mask pixels is non-zero */
4382 if (pack_cmp != 0xffff)
4384 in_over_2x128 (&xmm_src, &xmm_src,
4385 &xmm_alpha, &xmm_alpha,
4386 &xmm_mask_lo, &xmm_mask_hi,
4387 &xmm_dst0, &xmm_dst1);
4391 pack_cmp = _mm_movemask_epi8 (
4392 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4394 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4396 if (pack_cmp != 0xffff)
4398 in_over_2x128 (&xmm_src, &xmm_src,
4399 &xmm_alpha, &xmm_alpha,
4400 &xmm_mask_lo, &xmm_mask_hi,
4401 &xmm_dst2, &xmm_dst3);
4405 (__m128i*)dst, pack_565_4x128_128 (
4406 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4415 m = *(uint32_t *) mask;
4420 mmx_mask = unpack_32_1x64 (m);
4421 mmx_dest = expand565_16_1x64 (d);
4423 *dst = pack_565_32_16 (
4426 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4438 /* -----------------------------------------------------------------------
4439 * composite_in_n_8_8
4443 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4445 pixman_image_t * src_image,
4446 pixman_image_t * mask_image,
4447 pixman_image_t * dst_image,
4457 uint8_t *dst_line, *dst;
4458 uint8_t *mask_line, *mask;
4459 int dst_stride, mask_stride;
4466 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4467 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4469 PIXMAN_IMAGE_GET_LINE (
4470 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4471 PIXMAN_IMAGE_GET_LINE (
4472 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4474 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4478 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4483 dst_line += dst_stride;
4485 mask_line += mask_stride;
4488 while (w && ((unsigned long)dst & 15))
4490 m = (uint32_t) *mask++;
4491 d = (uint32_t) *dst;
4493 *dst++ = (uint8_t) pack_1x64_32 (
4495 pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4496 unpack_32_1x64 (m)),
4497 unpack_32_1x64 (d)));
4503 xmm_mask = load_128_unaligned ((__m128i*)mask);
4504 xmm_dst = load_128_aligned ((__m128i*)dst);
4506 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4507 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4509 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4510 &xmm_mask_lo, &xmm_mask_hi,
4511 &xmm_mask_lo, &xmm_mask_hi);
4513 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4514 &xmm_dst_lo, &xmm_dst_hi,
4515 &xmm_dst_lo, &xmm_dst_hi);
4518 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4527 m = (uint32_t) *mask++;
4528 d = (uint32_t) *dst;
4530 *dst++ = (uint8_t) pack_1x64_32 (
4533 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4534 unpack_32_1x64 (d)));
4542 /* -----------------------------------------------------------------------
 * composite_in_n_8
4547 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4549 pixman_image_t * src_image,
4550 pixman_image_t * mask_image,
4551 pixman_image_t * dst_image,
4561 uint8_t *dst_line, *dst;
4568 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4570 PIXMAN_IMAGE_GET_LINE (
4571 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4573 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4575 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4584 pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4585 8, dest_x, dest_y, width, height, src);
4593 dst_line += dst_stride;
4596 while (w && ((unsigned long)dst & 15))
4598 d = (uint32_t) *dst;
4600 *dst++ = (uint8_t) pack_1x64_32 (
4602 _mm_movepi64_pi64 (xmm_alpha),
4603 unpack_32_1x64 (d)));
4609 xmm_dst = load_128_aligned ((__m128i*)dst);
4611 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4613 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4614 &xmm_dst_lo, &xmm_dst_hi,
4615 &xmm_dst_lo, &xmm_dst_hi);
4618 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4626 d = (uint32_t) *dst;
4628 *dst++ = (uint8_t) pack_1x64_32 (
4630 _mm_movepi64_pi64 (xmm_alpha),
4631 unpack_32_1x64 (d)));
4639 /* ---------------------------------------------------------------------------
 * composite_in_8_8
4644 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4646 pixman_image_t * src_image,
4647 pixman_image_t * mask_image,
4648 pixman_image_t * dst_image,
4658 uint8_t *dst_line, *dst;
4659 uint8_t *src_line, *src;
4660 int src_stride, dst_stride;
4664 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4665 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4667 PIXMAN_IMAGE_GET_LINE (
4668 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4669 PIXMAN_IMAGE_GET_LINE (
4670 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4675 dst_line += dst_stride;
4677 src_line += src_stride;
4680 while (w && ((unsigned long)dst & 15))
4682 s = (uint32_t) *src++;
4683 d = (uint32_t) *dst;
4685 *dst++ = (uint8_t) pack_1x64_32 (
4687 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4693 xmm_src = load_128_unaligned ((__m128i*)src);
4694 xmm_dst = load_128_aligned ((__m128i*)dst);
4696 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4697 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4699 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4700 &xmm_dst_lo, &xmm_dst_hi,
4701 &xmm_dst_lo, &xmm_dst_hi);
4704 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4713 s = (uint32_t) *src++;
4714 d = (uint32_t) *dst;
4716 *dst++ = (uint8_t) pack_1x64_32 (
4717 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
4725 /* -------------------------------------------------------------------------
4726 * composite_add_n_8_8
4730 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4732 pixman_image_t * src_image,
4733 pixman_image_t * mask_image,
4734 pixman_image_t * dst_image,
4744 uint8_t *dst_line, *dst;
4745 uint8_t *mask_line, *mask;
4746 int dst_stride, mask_stride;
4753 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4754 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4756 PIXMAN_IMAGE_GET_LINE (
4757 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4758 PIXMAN_IMAGE_GET_LINE (
4759 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4761 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4765 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4770 dst_line += dst_stride;
4772 mask_line += mask_stride;
4775 while (w && ((unsigned long)dst & 15))
4777 m = (uint32_t) *mask++;
4778 d = (uint32_t) *dst;
4780 *dst++ = (uint8_t) pack_1x64_32 (
4783 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4784 unpack_32_1x64 (d)));
4790 xmm_mask = load_128_unaligned ((__m128i*)mask);
4791 xmm_dst = load_128_aligned ((__m128i*)dst);
4793 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4794 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4796 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4797 &xmm_mask_lo, &xmm_mask_hi,
4798 &xmm_mask_lo, &xmm_mask_hi);
4800 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4801 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4804 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4813 m = (uint32_t) *mask++;
4814 d = (uint32_t) *dst;
4816 *dst++ = (uint8_t) pack_1x64_32 (
4819 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4820 unpack_32_1x64 (d)));
4829 /* -------------------------------------------------------------------------
4830 * composite_add_n_8
4834 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4836 pixman_image_t * src_image,
4837 pixman_image_t * mask_image,
4838 pixman_image_t * dst_image,
4848 uint8_t *dst_line, *dst;
4855 PIXMAN_IMAGE_GET_LINE (
4856 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4858 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4867 pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4868 8, dest_x, dest_y, width, height, 0xff);
4873 src = (src << 24) | (src << 16) | (src << 8) | src;
4874 xmm_src = _mm_set_epi32 (src, src, src, src);
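/* 'src' now holds the 8-bit solid source value replicated into every
 * byte (e.g. 0x7f becomes 0x7f7f7f7f), and xmm_src broadcasts it
 * across all 16 byte lanes so the saturating add below can process
 * 16 destination pixels at a time.
 */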
4879 dst_line += dst_stride;
4882 while (w && ((unsigned long)dst & 15))
4884 *dst = (uint8_t)_mm_cvtsi64_si32 (
4886 _mm_movepi64_pi64 (xmm_src),
4887 _mm_cvtsi32_si64 (*dst)));
4896 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4904 *dst = (uint8_t)_mm_cvtsi64_si32 (
4906 _mm_movepi64_pi64 (xmm_src),
4907 _mm_cvtsi32_si64 (*dst)));
4917 /* ----------------------------------------------------------------------
 * composite_add_8_8
4922 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4924 pixman_image_t * src_image,
4925 pixman_image_t * mask_image,
4926 pixman_image_t * dst_image,
4936 uint8_t *dst_line, *dst;
4937 uint8_t *src_line, *src;
4938 int dst_stride, src_stride;
4942 PIXMAN_IMAGE_GET_LINE (
4943 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4944 PIXMAN_IMAGE_GET_LINE (
4945 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4952 dst_line += dst_stride;
4953 src_line += src_stride;
4957 while (w && (unsigned long)dst & 3)
4959 t = (*dst) + (*src++);
4960 *dst++ = t | (0 - (t >> 8));
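/* The store above saturates without a branch: t is at most 0x1fe, so
 * (t >> 8) is 1 exactly when the sum overflowed 8 bits, and
 * (0 - (t >> 8)) is then all ones.  OR-ing it in before the 8-bit
 * store clamps the result to 0xff, e.g. 0xf0 + 0x20 = 0x110 is
 * stored as 0xff.
 */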
4964 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4974 t = (*dst) + (*src++);
4975 *dst++ = t | (0 - (t >> 8));
4983 /* ---------------------------------------------------------------------
4984 * composite_add_8888_8888
4987 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4989 pixman_image_t * src_image,
4990 pixman_image_t * mask_image,
4991 pixman_image_t * dst_image,
5001 uint32_t *dst_line, *dst;
5002 uint32_t *src_line, *src;
5003 int dst_stride, src_stride;
5005 PIXMAN_IMAGE_GET_LINE (
5006 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5007 PIXMAN_IMAGE_GET_LINE (
5008 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5013 dst_line += dst_stride;
5015 src_line += src_stride;
5017 core_combine_add_u_sse2 (dst, src, NULL, width);
5023 /* -------------------------------------------------------------------------------------------------
5024 * sse2_composite_copy_area
5027 static pixman_bool_t
5028 pixman_blt_sse2 (uint32_t *src_bits,
5041 uint8_t * src_bytes;
5042 uint8_t * dst_bytes;
5045 if (src_bpp != dst_bpp)
5050 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5051 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5052 src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5053 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5054 byte_width = 2 * width;
5058 else if (src_bpp == 32)
5060 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5061 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5062 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5063 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5064 byte_width = 4 * width;
5076 uint8_t *s = src_bytes;
5077 uint8_t *d = dst_bytes;
5078 src_bytes += src_stride;
5079 dst_bytes += dst_stride;
5082 while (w >= 2 && ((unsigned long)d & 3))
5084 *(uint16_t *)d = *(uint16_t *)s;
5090 while (w >= 4 && ((unsigned long)d & 15))
5092 *(uint32_t *)d = *(uint32_t *)s;
5101 __m128i xmm0, xmm1, xmm2, xmm3;
5103 xmm0 = load_128_unaligned ((__m128i*)(s));
5104 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5105 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5106 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5108 save_128_aligned ((__m128i*)(d), xmm0);
5109 save_128_aligned ((__m128i*)(d + 16), xmm1);
5110 save_128_aligned ((__m128i*)(d + 32), xmm2);
5111 save_128_aligned ((__m128i*)(d + 48), xmm3);
5120 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5129 *(uint32_t *)d = *(uint32_t *)s;
5138 *(uint16_t *)d = *(uint16_t *)s;
5151 sse2_composite_copy_area (pixman_implementation_t *imp,
5153 pixman_image_t * src_image,
5154 pixman_image_t * mask_image,
5155 pixman_image_t * dst_image,
5165 pixman_blt_sse2 (src_image->bits.bits,
5166 dst_image->bits.bits,
5167 src_image->bits.rowstride,
5168 dst_image->bits.rowstride,
5169 PIXMAN_FORMAT_BPP (src_image->bits.format),
5170 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5171 src_x, src_y, dest_x, dest_y, width, height);
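/* A minimal scalar sketch of what pixman_blt_sse2 above computes per
 * blit, shown for reference only under the assumption that memcpy
 * (<string.h>) is available and that src_bytes/dst_bytes/byte_width
 * are set up the same way; the block is not compiled.
 */
#if 0
static void
blt_scalar_reference (uint8_t *src_bytes,  int src_stride,
                      uint8_t *dst_bytes,  int dst_stride,
                      int      byte_width, int height)
{
    while (height--)
    {
	/* copy one scanline, then advance both pointers by their strides */
	memcpy (dst_bytes, src_bytes, byte_width);
	src_bytes += src_stride;
	dst_bytes += dst_stride;
    }
}
#endif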
5175 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5177 pixman_image_t * src_image,
5178 pixman_image_t * mask_image,
5179 pixman_image_t * dst_image,
5189 uint32_t *src, *src_line, s;
5190 uint32_t *dst, *dst_line, d;
5191 uint8_t *mask, *mask_line;
5193 int src_stride, mask_stride, dst_stride;
5197 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5198 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5199 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5201 PIXMAN_IMAGE_GET_LINE (
5202 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5203 PIXMAN_IMAGE_GET_LINE (
5204 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5205 PIXMAN_IMAGE_GET_LINE (
5206 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5211 src_line += src_stride;
5213 dst_line += dst_stride;
5215 mask_line += mask_stride;
5219 while (w && (unsigned long)dst & 15)
5221 s = 0xff000000 | *src++;
5222 m = (uint32_t) *mask++;
5224 ms = unpack_32_1x64 (s);
5228 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5229 __m64 md = unpack_32_1x64 (d);
5231 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5234 *dst++ = pack_1x64_32 (ms);
5240 m = *(uint32_t*) mask;
5241 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5243 if (m == 0xffffffff)
5245 save_128_aligned ((__m128i*)dst, xmm_src);
5249 xmm_dst = load_128_aligned ((__m128i*)dst);
5251 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5253 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5254 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5255 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5257 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5259 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5261 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5272 m = (uint32_t) *mask++;
5276 s = 0xff000000 | *src;
5288 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5289 md = unpack_32_1x64 (d);
5290 ms = unpack_32_1x64 (s);
5292 *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5307 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5309 pixman_image_t * src_image,
5310 pixman_image_t * mask_image,
5311 pixman_image_t * dst_image,
5321 uint32_t *src, *src_line, s;
5322 uint32_t *dst, *dst_line, d;
5323 uint8_t *mask, *mask_line;
5325 int src_stride, mask_stride, dst_stride;
5328 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5329 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5330 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5332 PIXMAN_IMAGE_GET_LINE (
5333 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5334 PIXMAN_IMAGE_GET_LINE (
5335 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5336 PIXMAN_IMAGE_GET_LINE (
5337 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5342 src_line += src_stride;
5344 dst_line += dst_stride;
5346 mask_line += mask_stride;
5350 while (w && (unsigned long)dst & 15)
5355 m = (uint32_t) *mask++;
5362 if (sa == 0xff && m == 0xff)
5368 __m64 ms, md, ma, msa;
5370 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5371 ms = unpack_32_1x64 (s);
5372 md = unpack_32_1x64 (d);
5374 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5376 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5386 m = *(uint32_t *) mask;
5390 xmm_src = load_128_unaligned ((__m128i*)src);
5392 if (m == 0xffffffff && is_opaque (xmm_src))
5394 save_128_aligned ((__m128i *)dst, xmm_src);
5398 xmm_dst = load_128_aligned ((__m128i *)dst);
5400 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5402 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5403 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5404 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5406 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5407 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5409 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5410 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5412 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5427 m = (uint32_t) *mask++;
5434 if (sa == 0xff && m == 0xff)
5440 __m64 ms, md, ma, msa;
5442 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5443 ms = unpack_32_1x64 (s);
5444 md = unpack_32_1x64 (d);
5446 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5448 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5461 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5463 pixman_image_t * src_image,
5464 pixman_image_t * mask_image,
5465 pixman_image_t * dst_image,
5476 uint32_t *dst_line, *dst;
5478 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5479 __m128i xmm_dsta_hi, xmm_dsta_lo;
5483 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5488 PIXMAN_IMAGE_GET_LINE (
5489 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5491 xmm_src = expand_pixel_32_1x128 (src);
5497 dst_line += dst_stride;
5500 while (w && (unsigned long)dst & 15)
5504 vd = unpack_32_1x64 (*dst);
5506 *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5507 _mm_movepi64_pi64 (xmm_src)));
5514 __m128i tmp_lo, tmp_hi;
5516 xmm_dst = load_128_aligned ((__m128i*)dst);
5518 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5519 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5524 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5525 &xmm_dsta_lo, &xmm_dsta_hi,
5529 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5539 vd = unpack_32_1x64 (*dst);
5541 *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5542 _mm_movepi64_pi64 (xmm_src)));
5553 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5555 pixman_image_t * src_image,
5556 pixman_image_t * mask_image,
5557 pixman_image_t * dst_image,
5567 uint32_t *src, *src_line, s;
5568 uint32_t *dst, *dst_line, d;
5569 uint32_t *mask, *mask_line;
5571 int src_stride, mask_stride, dst_stride;
5574 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5575 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5576 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5578 PIXMAN_IMAGE_GET_LINE (
5579 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5580 PIXMAN_IMAGE_GET_LINE (
5581 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5582 PIXMAN_IMAGE_GET_LINE (
5583 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5588 src_line += src_stride;
5590 dst_line += dst_stride;
5592 mask_line += mask_stride;
5596 while (w && (unsigned long)dst & 15)
5601 m = (*mask++) >> 24;
5608 if (sa == 0xff && m == 0xff)
5614 __m64 ms, md, ma, msa;
5616 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5617 ms = unpack_32_1x64 (s);
5618 md = unpack_32_1x64 (d);
5620 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5622 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5632 xmm_mask = load_128_unaligned ((__m128i*)mask);
5634 if (!is_transparent (xmm_mask))
5636 xmm_src = load_128_unaligned ((__m128i*)src);
5638 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5640 save_128_aligned ((__m128i *)dst, xmm_src);
5644 xmm_dst = load_128_aligned ((__m128i *)dst);
5646 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5647 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5648 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5650 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5651 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5653 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5654 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5656 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5671 m = (*mask++) >> 24;
5678 if (sa == 0xff && m == 0xff)
5684 __m64 ms, md, ma, msa;
5686 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5687 ms = unpack_32_1x64 (s);
5688 md = unpack_32_1x64 (d);
5690 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5692 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5704 /* A variant of 'core_combine_over_u_sse2' with minor tweaks */
5705 static force_inline void
5706 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5710 pixman_fixed_t unit_x,
5711 pixman_fixed_t max_vx)
5714 const uint32_t* pm = NULL;
5716 __m128i xmm_dst_lo, xmm_dst_hi;
5717 __m128i xmm_src_lo, xmm_src_hi;
5718 __m128i xmm_alpha_lo, xmm_alpha_hi;
5720 /* Align dst on a 16-byte boundary */
5721 while (w && ((unsigned long)pd & 15))
5724 s = combine1 (ps + (vx >> 16), pm);
5727 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5736 uint32_t tmp1, tmp2, tmp3, tmp4;
5738 tmp1 = ps[vx >> 16];
5740 tmp2 = ps[vx >> 16];
5742 tmp3 = ps[vx >> 16];
5744 tmp4 = ps[vx >> 16];
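/* vx is a 16.16 fixed-point source coordinate: vx >> 16 selects the
 * nearest source pixel, and each destination pixel advances vx by
 * unit_x.  For example, at a 2x downscale unit_x is 0x20000, so
 * consecutive fetches read every other source pixel.
 */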
5747 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5749 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5751 if (is_opaque (xmm_src_hi))
5753 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5755 else if (!is_zero (xmm_src_hi))
5757 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5759 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5760 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5762 expand_alpha_2x128 (
5763 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5765 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5766 &xmm_alpha_lo, &xmm_alpha_hi,
5767 &xmm_dst_lo, &xmm_dst_hi);
5769 /* rebuild the 4 pixel data and save */
5770 save_128_aligned ((__m128i*)pd,
5771 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5783 s = combine1 (ps + (vx >> 16), pm);
5786 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5795 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5796 scaled_nearest_scanline_sse2_8888_8888_OVER,
5797 uint32_t, uint32_t, COVER)
5798 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5799 scaled_nearest_scanline_sse2_8888_8888_OVER,
5800 uint32_t, uint32_t, NONE)
5801 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5802 scaled_nearest_scanline_sse2_8888_8888_OVER,
5803 uint32_t, uint32_t, PAD)
5805 static const pixman_fast_path_t sse2_fast_paths[] =
5807 /* PIXMAN_OP_OVER */
5808 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5809 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5810 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5811 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5812 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5813 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5814 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5815 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5816 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5817 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5818 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5819 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5820 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5821 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5822 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5823 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5824 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5825 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5826 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5827 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5828 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5829 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5830 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5831 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5832 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5833 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5834 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5835 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5836 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5837 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5838 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5839 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5840 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5841 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5842 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5843 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5844 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5845 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5846 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5847 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5848 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5849 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5850 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5851 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5852 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5853 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5855 /* PIXMAN_OP_OVER_REVERSE */
5856 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5857 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5860 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5861 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5862 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5863 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5864 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5865 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5868 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5869 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5870 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5871 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5872 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5873 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5874 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5875 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5876 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5877 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5878 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5879 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5880 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5881 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5884 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5885 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5886 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5888 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5889 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5890 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5891 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5892 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5893 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5894 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5895 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5896 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5897 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5898 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5899 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5904 static pixman_bool_t
5905 sse2_blt (pixman_implementation_t *imp,
5906 uint32_t * src_bits,
5907 uint32_t * dst_bits,
5919 if (!pixman_blt_sse2 (
5920 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5921 src_x, src_y, dst_x, dst_y, width, height))
5924 return _pixman_implementation_blt (
5926 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5927 src_x, src_y, dst_x, dst_y, width, height);
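/* The __force_align_arg_pointer__ attribute below makes GCC realign
 * the stack to 16 bytes on 32-bit x86, where the ABI only guarantees
 * 4-byte alignment at function entry; it is unnecessary on x86-64,
 * which the #if excludes.
 */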
5933 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5934 __attribute__((__force_align_arg_pointer__))
5936 static pixman_bool_t
5937 sse2_fill (pixman_implementation_t *imp,
5947 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5949 return _pixman_implementation_fill (
5950 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5957 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
5959 int w = iter->width;
5960 __m128i ff000000 = mask_ff000000;
5961 uint32_t *dst = iter->buffer;
5962 uint32_t *src = (uint32_t *)iter->bits;
5964 iter->bits += iter->stride;
5966 while (w && ((unsigned long)dst) & 0x0f)
5968 *dst++ = (*src++) | 0xff000000;
5975 (__m128i *)dst, _mm_or_si128 (
5976 load_128_unaligned ((__m128i *)src), ff000000));
5985 *dst++ = (*src++) | 0xff000000;
5989 return iter->buffer;
5993 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
5995 int w = iter->width;
5996 uint32_t *dst = iter->buffer;
5997 uint8_t *src = iter->bits;
5998 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6000 iter->bits += iter->stride;
6002 while (w && (((unsigned long)dst) & 15))
6004 *dst++ = *(src++) << 24;
6010 xmm0 = _mm_loadu_si128((__m128i *)src);
6012 xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
6013 xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
6014 xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6015 xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6016 xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6017 xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
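/* Two rounds of interleaving with zero move each 8-bit alpha value
 * into the top byte of its own 32-bit lane, i.e. an a8 byte 0x80
 * becomes the a8r8g8b8 pixel 0x80000000, matching the scalar
 * '*dst++ = *src++ << 24' path above.
 */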
6019 _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
6020 _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
6021 _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
6022 _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6031 *dst++ = *(src++) << 24;
6035 return iter->buffer;
6040 pixman_format_code_t format;
6041 pixman_iter_get_scanline_t get_scanline;
6044 static const fetcher_info_t fetchers[] =
6046 { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
6047 { PIXMAN_a8, sse2_fetch_a8 },
6052 sse2_src_iter_init (pixman_implementation_t *imp,
6053 pixman_iter_t *iter,
6054 pixman_image_t *image,
6055 int x, int y, int width, int height,
6056 uint8_t *buffer, iter_flags_t flags)
6059 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
6061 if ((flags & ITER_NARROW) &&
6062 (image->common.flags & FLAGS) == FLAGS &&
6064 x + width <= image->bits.width &&
6065 y + height <= image->bits.height)
6067 const fetcher_info_t *f;
6069 for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
6071 if (image->common.extended_format_code == f->format)
6073 uint8_t *b = (uint8_t *)image->bits.bits;
6074 int s = image->bits.rowstride * 4;
6076 iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
6078 iter->width = width;
6079 iter->buffer = (uint32_t *)buffer;
6081 iter->get_scanline = f->get_scanline;
6087 _pixman_implementation_src_iter_init (
6088 imp->delegate, iter, image, x, y, width, height, buffer, flags);
6091 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6092 __attribute__((__force_align_arg_pointer__))
6094 pixman_implementation_t *
6095 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6097 pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6099 /* SSE2 constants */
6100 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6101 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6102 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6103 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6104 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6105 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6106 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6107 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6108 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
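/* mask_565_fix_rb and mask_565_fix_g select the top bits of the
 * widened 5/6-bit channels so they can be replicated into the low
 * bits when expanding r5g6b5 to 8 bits per channel, e.g. the 5-bit
 * red value 0x16 expands to (0x16 << 3) | (0x16 >> 2) = 0xb5 rather
 * than just 0xb0.
 */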
6109 mask_0080 = create_mask_16_128 (0x0080);
6110 mask_00ff = create_mask_16_128 (0x00ff);
6111 mask_0101 = create_mask_16_128 (0x0101);
6112 mask_ffff = create_mask_16_128 (0xffff);
6113 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6114 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6117 mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
6118 mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
6120 mask_x0080 = create_mask_16_64 (0x0080);
6121 mask_x00ff = create_mask_16_64 (0x00ff);
6122 mask_x0101 = create_mask_16_64 (0x0101);
6123 mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
6127 /* Set up function pointers */
6129 /* SSE2 code paths for fbcompose.c */
6130 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6131 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6132 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6133 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6134 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6135 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6136 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6137 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6138 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6139 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6141 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6143 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6144 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6145 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6146 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6147 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6148 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6149 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6150 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6151 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6152 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6153 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6155 imp->blt = sse2_blt;
6156 imp->fill = sse2_fill;
6158 imp->src_iter_init = sse2_src_iter_init;
6163 #endif /* USE_SSE2 */