2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41 * the pixman-x64-mmx-emulation.h file contains
42 * implementations of those MMX intrinsics that
43 * are used in the SSE2 implementation.
45 # include "pixman-x64-mmx-emulation.h"
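/* For illustration only (this is not the contents of that header): an MMX
 * intrinsic can generally be emulated on top of SSE2 by carrying the 64-bit
 * value in the low half of an __m128i, so that e.g. a saturating add maps
 * directly onto its 128-bit counterpart:
 *
 *     static __m128i emu_adds_pu8 (__m128i a, __m128i b)
 *     {
 *         return _mm_adds_epu8 (a, b);
 *     }
 */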
50 /* --------------------------------------------------------------------
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
79 /* ----------------------------------------------------------------------
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
85 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
91 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
98 __m128i r, g, b, rb, t;
100 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
104 rb = _mm_or_si128 (r, b);
105 t = _mm_and_si128 (rb, mask_565_fix_rb);
106 t = _mm_srli_epi32 (t, 5);
107 rb = _mm_or_si128 (rb, t);
109 t = _mm_and_si128 (g, mask_565_fix_g);
110 t = _mm_srli_epi32 (t, 6);
111 g = _mm_or_si128 (g, t);
113 return _mm_or_si128 (rb, g);
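/* A scalar sketch (for reference, not used below) of what unpack_565_to_8888
 * computes per pixel: each field is widened and its top bits are replicated
 * into the freshly opened low bits, so that 0x1f expands to 0xff rather
 * than 0xf8.
 */
static force_inline uint32_t
expand_565_scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}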
116 static force_inline void
117 unpack_565_128_4x128 (__m128i data,
125 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
128 lo = unpack_565_to_8888 (lo);
129 hi = unpack_565_to_8888 (hi);
131 unpack_128_2x128 (lo, data0, data1);
132 unpack_128_2x128 (hi, data2, data3);
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
138 return (uint16_t) (((pixel >> 8) & 0xf800) |
139 ((pixel >> 5) & 0x07e0) |
140 ((pixel >> 3) & 0x001f));
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
146 return _mm_packus_epi16 (lo, hi);
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
153 __m128i r, g1, g2, b;
155 data = pack_2x128_128 (lo, hi);
157 r = _mm_and_si128 (data, mask_565_r);
158 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
162 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
168 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169 pack_565_2x128_128 (*xmm2, *xmm3));
172 static force_inline int
173 is_opaque (__m128i x)
175 __m128i ffs = _mm_cmpeq_epi8 (x, x);
177 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
180 static force_inline int
181 is_zero (__m128i x)
183 return _mm_movemask_epi8 (
184 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
187 static force_inline int
188 is_transparent (__m128i x)
190 return (_mm_movemask_epi8 (
191 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
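/* The 0x8888 masks above exploit _mm_movemask_epi8 gathering one bit per
 * byte: bits 3, 7, 11 and 15 are the alpha bytes of the four packed ARGB
 * pixels.  A scalar equivalent of is_opaque, for reference:
 */
static force_inline int
is_opaque_scalar (const uint32_t p[4])
{
    return ((p[0] & p[1] & p[2] & p[3]) >> 24) == 0xff;
}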
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
197 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
203 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204 _MM_SHUFFLE (3, 3, 3, 3)),
205 _MM_SHUFFLE (3, 3, 3, 3));
208 static force_inline void
209 expand_alpha_2x128 (__m128i data_lo,
216 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
219 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i data_lo,
231 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
247 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249 lo = _mm_adds_epu16 (lo, mask_0080);
250 hi = _mm_adds_epu16 (hi, mask_0080);
251 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
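/* The mullo/adds/mulhi sequence above computes an exact, rounded
 * a * b / 255 per 8-bit channel: adding 0x0080 and taking the high word of
 * a multiply by 0x0101 is the classic (t + (t >> 8) + 0x80) >> 8 trick
 * (cf. MUL_UN8 in pixman-combine32.h).  A scalar sketch:
 */
static force_inline uint8_t
mul_un8_scalar (uint8_t a, uint8_t b)
{
    uint32_t t = (uint32_t) a * b + 0x80;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}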
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
258 __m128i* alpha_dst_lo,
259 __m128i* alpha_dst_hi,
262 __m128i* alpha_src_lo,
263 __m128i* alpha_src_hi,
267 __m128i t1_lo, t1_hi;
268 __m128i t2_lo, t2_hi;
270 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
273 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
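/* pix_add_multiply computes (x * ax + y * ay) / 255 per channel with a
 * saturating final add; the ATOP and XOR combiners further down build on it
 * by first negating one or both of the alpha arguments. */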
277 static force_inline void
278 negate_2x128 (__m128i data_lo,
283 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
287 static force_inline void
288 invert_colors_2x128 (__m128i data_lo,
295 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
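/* OVER operator: dst = src + dst * (255 - alpha) / 255 per 8-bit channel,
 * applied to two unpacked halves (four pixels) at a time. */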
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
311 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
313 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
315 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i src_lo,
326 __m128i alpha_lo, alpha_hi;
328 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
330 lo = _mm_or_si128 (alpha_lo, mask_alpha);
331 hi = _mm_or_si128 (alpha_hi, mask_alpha);
333 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
335 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
337 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
353 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
356 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
359 static force_inline void
360 cache_prefetch (__m128i* addr)
362 _mm_prefetch (addr, _MM_HINT_T0);
365 static force_inline void
366 cache_prefetch_next (__m128i* addr)
368 _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
371 /* load 4 pixels from a 16-byte-aligned address */
372 static force_inline __m128i
373 load_128_aligned (__m128i* src)
375 return _mm_load_si128 (src);
378 /* load 4 pixels from an unaligned address */
379 static force_inline __m128i
380 load_128_unaligned (const __m128i* src)
382 return _mm_loadu_si128 (src);
385 /* save 4 pixels using Write Combining memory on a 16-byte-aligned
386  * address
388 static force_inline void
389 save_128_write_combining (__m128i* dst,
392 _mm_stream_si128 (dst, data);
395 /* save 4 pixels on a 16-byte-aligned address */
396 static force_inline void
397 save_128_aligned (__m128i* dst,
400 _mm_store_si128 (dst, data);
403 /* save 4 pixels on an unaligned address */
404 static force_inline void
405 save_128_unaligned (__m128i* dst,
408 _mm_storeu_si128 (dst, data);
411 /* ------------------------------------------------------------------
415 static force_inline __m64
416 unpack_32_1x64 (uint32_t data)
418 return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
421 static force_inline __m64
422 expand_alpha_1x64 (__m64 data)
424 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
427 static force_inline __m64
428 expand_alpha_rev_1x64 (__m64 data)
430 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
433 static force_inline __m64
434 expand_pixel_8_1x64 (uint8_t data)
436 return _mm_shuffle_pi16 (
437 unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
440 static force_inline __m64
441 pix_multiply_1x64 (__m64 data,
444 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
449 static force_inline __m64
450 pix_add_multiply_1x64 (__m64* src,
455 __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
456 __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
458 return _mm_adds_pu8 (t1, t2);
461 static force_inline __m64
462 negate_1x64 (__m64 data)
464 return _mm_xor_si64 (data, mask_x00ff);
467 static force_inline __m64
468 invert_colors_1x64 (__m64 data)
470 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
473 static force_inline __m64
474 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
476 return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
479 static force_inline __m64
480 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
482 return over_1x64 (pix_multiply_1x64 (*src, *mask),
483 pix_multiply_1x64 (*alpha, *mask),
487 static force_inline __m64
488 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
490 __m64 alpha = expand_alpha_1x64 (src);
492 return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
493 _mm_or_si64 (alpha, mask_x_alpha)),
498 static force_inline uint32_t
499 pack_1x64_32 (__m64 data)
501 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
504 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into
508 * --- Expanding 565 in the low word ---
510 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
511 * m = m & (01f0003f001f);
512 * m = m * (008404100840);
515 * Note the trick here: the top word is shifted by another nibble to
516 * avoid it bumping into the middle word.
518 static force_inline __m64
519 expand565_16_1x64 (uint16_t pixel)
524 p = _mm_cvtsi32_si64 ((uint32_t) pixel);
526 t1 = _mm_slli_si64 (p, 36 - 11);
527 t2 = _mm_slli_si64 (p, 16 - 5);
529 p = _mm_or_si64 (t1, p);
530 p = _mm_or_si64 (t2, p);
531 p = _mm_and_si64 (p, mask_x565_rgb);
532 p = _mm_mullo_pi16 (p, mask_x565_unpack);
534 return _mm_srli_pi16 (p, 8);
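/* Worked example of the trick for the blue field: with the 5-bit value f in
 * bits 0-4 of a 16-bit lane, f * 0x0840 = (f << 11) | (f << 6), so after the
 * final >> 8 the lane holds (f << 3) | (f >> 2) - the field widened to
 * 8 bits with its top bits replicated. */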
537 /* ----------------------------------------------------------------------------
538 * Compose Core transformations
540 static force_inline uint32_t
541 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
554 ms = unpack_32_1x64 (src);
555 return pack_1x64_32 (
556 over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
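/* combine1 loads one source pixel and, when a mask is present, multiplies it
 * by the mask pixel's expanded alpha; combine4 below is the four-pixel
 * analogue, with an early out when the mask is fully transparent. */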
562 static force_inline uint32_t
563 combine1 (const uint32_t *ps, const uint32_t *pm)
571 mm = unpack_32_1x64 (*pm);
572 mm = expand_alpha_1x64 (mm);
574 ms = unpack_32_1x64 (s);
575 ms = pix_multiply_1x64 (ms, mm);
577 s = pack_1x64_32 (ms);
583 static force_inline __m128i
584 combine4 (const __m128i *ps, const __m128i *pm)
586 __m128i xmm_src_lo, xmm_src_hi;
587 __m128i xmm_msk_lo, xmm_msk_hi;
592 xmm_msk_lo = load_128_unaligned (pm);
594 if (is_transparent (xmm_msk_lo))
595 return _mm_setzero_si128 ();
598 s = load_128_unaligned (ps);
602 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
603 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
605 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
607 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
608 &xmm_msk_lo, &xmm_msk_hi,
609 &xmm_src_lo, &xmm_src_hi);
611 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
617 static force_inline void
618 core_combine_over_u_sse2 (uint32_t* pd,
625 __m128i xmm_dst_lo, xmm_dst_hi;
626 __m128i xmm_src_lo, xmm_src_hi;
627 __m128i xmm_alpha_lo, xmm_alpha_hi;
629 /* issue prefetch hints to optimize cache loads */
630 cache_prefetch ((__m128i*)ps);
631 cache_prefetch ((__m128i*)pd);
632 cache_prefetch ((__m128i*)pm);
634 /* Align dst on a 16-byte boundary */
635 while (w && ((unsigned long)pd & 15))
638 s = combine1 (ps, pm);
640 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
647 /* issue prefetch hints to optimize cache loads */
648 cache_prefetch ((__m128i*)ps);
649 cache_prefetch ((__m128i*)pd);
650 cache_prefetch ((__m128i*)pm);
654 /* prefetch the next cache line */
655 cache_prefetch_next ((__m128i*)ps);
656 cache_prefetch_next ((__m128i*)pd);
657 cache_prefetch_next ((__m128i*)pm);
659 /* I'm loading unaligned because I'm not sure about
660 * the address alignment.
662 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
664 if (is_opaque (xmm_src_hi))
666 save_128_aligned ((__m128i*)pd, xmm_src_hi);
668 else if (!is_zero (xmm_src_hi))
670 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
672 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
673 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
676 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
678 over_2x128 (&xmm_src_lo, &xmm_src_hi,
679 &xmm_alpha_lo, &xmm_alpha_hi,
680 &xmm_dst_lo, &xmm_dst_hi);
682 /* rebuild the 4-pixel data and save */
683 save_128_aligned ((__m128i*)pd,
684 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
697 s = combine1 (ps, pm);
699 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
708 static force_inline void
709 core_combine_over_reverse_u_sse2 (uint32_t* pd,
716 __m128i xmm_dst_lo, xmm_dst_hi;
717 __m128i xmm_src_lo, xmm_src_hi;
718 __m128i xmm_alpha_lo, xmm_alpha_hi;
720 /* issue prefetch hints to optimize cache loads */
721 cache_prefetch ((__m128i*)ps);
722 cache_prefetch ((__m128i*)pd);
723 cache_prefetch ((__m128i*)pm);
725 /* Align dst on a 16-byte boundary */
727 ((unsigned long)pd & 15))
730 s = combine1 (ps, pm);
732 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
739 /* issue prefetch hints to optimize cache loads */
740 cache_prefetch ((__m128i*)ps);
741 cache_prefetch ((__m128i*)pd);
742 cache_prefetch ((__m128i*)pm);
746 /* prefetch the next cache line */
747 cache_prefetch_next ((__m128i*)ps);
748 cache_prefetch_next ((__m128i*)pd);
749 cache_prefetch_next ((__m128i*)pm);
751 /* I'm loading unaligned because I'm not sure
752 * about the address alignment.
754 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
755 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
757 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
758 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
760 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
761 &xmm_alpha_lo, &xmm_alpha_hi);
763 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
764 &xmm_alpha_lo, &xmm_alpha_hi,
765 &xmm_src_lo, &xmm_src_hi);
767 /* rebuild the 4-pixel data and save */
768 save_128_aligned ((__m128i*)pd,
769 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
782 s = combine1 (ps, pm);
784 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
792 static force_inline uint32_t
793 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
795 uint32_t maska = src >> 24;
801 else if (maska != 0xff)
803 return pack_1x64_32 (
804 pix_multiply_1x64 (unpack_32_1x64 (dst),
805 expand_alpha_1x64 (unpack_32_1x64 (src))));
811 static force_inline void
812 core_combine_in_u_sse2 (uint32_t* pd,
819 __m128i xmm_src_lo, xmm_src_hi;
820 __m128i xmm_dst_lo, xmm_dst_hi;
822 /* issue prefetch hints to optimize cache loads */
823 cache_prefetch ((__m128i*)ps);
824 cache_prefetch ((__m128i*)pd);
825 cache_prefetch ((__m128i*)pm);
827 while (w && ((unsigned long) pd & 15))
829 s = combine1 (ps, pm);
832 *pd++ = core_combine_in_u_pixelsse2 (d, s);
839 /* issue prefetch hints to optimize cache loads */
840 cache_prefetch ((__m128i*)ps);
841 cache_prefetch ((__m128i*)pd);
842 cache_prefetch ((__m128i*)pm);
846 /* prefetch the next cache line */
847 cache_prefetch_next ((__m128i*)ps);
848 cache_prefetch_next ((__m128i*)pd);
849 cache_prefetch_next ((__m128i*)pm);
851 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
852 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
854 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
855 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
857 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
858 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
859 &xmm_dst_lo, &xmm_dst_hi,
860 &xmm_dst_lo, &xmm_dst_hi);
862 save_128_aligned ((__m128i*)pd,
863 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
874 s = combine1 (ps, pm);
877 *pd++ = core_combine_in_u_pixelsse2 (d, s);
885 static force_inline void
886 core_combine_reverse_in_u_sse2 (uint32_t* pd,
893 __m128i xmm_src_lo, xmm_src_hi;
894 __m128i xmm_dst_lo, xmm_dst_hi;
896 /* issue prefetch hints to optimize cache loads */
897 cache_prefetch ((__m128i*)ps);
898 cache_prefetch ((__m128i*)pd);
899 cache_prefetch ((__m128i*)pm);
901 while (w && ((unsigned long) pd & 15))
903 s = combine1 (ps, pm);
906 *pd++ = core_combine_in_u_pixelsse2 (s, d);
913 /* issue prefetch hints to optimize cache loads */
914 cache_prefetch ((__m128i*)ps);
915 cache_prefetch ((__m128i*)pd);
916 cache_prefetch ((__m128i*)pm);
920 /* prefetch the next cache line */
921 cache_prefetch_next ((__m128i*)ps);
922 cache_prefetch_next ((__m128i*)pd);
923 cache_prefetch_next ((__m128i*)pm);
925 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
926 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
928 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
929 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
931 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
932 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
933 &xmm_src_lo, &xmm_src_hi,
934 &xmm_dst_lo, &xmm_dst_hi);
937 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
948 s = combine1 (ps, pm);
951 *pd++ = core_combine_in_u_pixelsse2 (s, d);
959 static force_inline void
960 core_combine_reverse_out_u_sse2 (uint32_t* pd,
965 /* issue prefetch hints to optimize cache loads */
966 cache_prefetch ((__m128i*)ps);
967 cache_prefetch ((__m128i*)pd);
968 cache_prefetch ((__m128i*)pm);
970 while (w && ((unsigned long) pd & 15))
972 uint32_t s = combine1 (ps, pm);
975 *pd++ = pack_1x64_32 (
977 unpack_32_1x64 (d), negate_1x64 (
978 expand_alpha_1x64 (unpack_32_1x64 (s)))));
986 /* issue prefetch hints to optimize cache loads */
987 cache_prefetch ((__m128i*)ps);
988 cache_prefetch ((__m128i*)pd);
989 cache_prefetch ((__m128i*)pm);
993 __m128i xmm_src_lo, xmm_src_hi;
994 __m128i xmm_dst_lo, xmm_dst_hi;
996 /* prefetch the next cache line */
997 cache_prefetch_next ((__m128i*)ps);
998 cache_prefetch_next ((__m128i*)pd);
999 cache_prefetch_next ((__m128i*)pm);
1001 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1002 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1004 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1005 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1007 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1008 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1010 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1011 &xmm_src_lo, &xmm_src_hi,
1012 &xmm_dst_lo, &xmm_dst_hi);
1015 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1027 uint32_t s = combine1 (ps, pm);
1030 *pd++ = pack_1x64_32 (
1032 unpack_32_1x64 (d), negate_1x64 (
1033 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1041 static force_inline void
1042 core_combine_out_u_sse2 (uint32_t* pd,
1047 /* issue prefetch hints to optimize cache loads */
1048 cache_prefetch ((__m128i*)ps);
1049 cache_prefetch ((__m128i*)pd);
1050 cache_prefetch ((__m128i*)pm);
1052 while (w && ((unsigned long) pd & 15))
1054 uint32_t s = combine1 (ps, pm);
1057 *pd++ = pack_1x64_32 (
1059 unpack_32_1x64 (s), negate_1x64 (
1060 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1067 /* issue prefetch hints to optimize cache loads */
1068 cache_prefetch ((__m128i*)ps);
1069 cache_prefetch ((__m128i*)pd);
1070 cache_prefetch ((__m128i*)pm);
1074 __m128i xmm_src_lo, xmm_src_hi;
1075 __m128i xmm_dst_lo, xmm_dst_hi;
1077 /* prefetch the next cache line */
1078 cache_prefetch_next ((__m128i*)ps);
1079 cache_prefetch_next ((__m128i*)pd);
1080 cache_prefetch_next ((__m128i*)pm);
1082 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1083 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1085 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1086 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1088 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1089 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1091 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1092 &xmm_dst_lo, &xmm_dst_hi,
1093 &xmm_dst_lo, &xmm_dst_hi);
1096 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1107 uint32_t s = combine1 (ps, pm);
1110 *pd++ = pack_1x64_32 (
1112 unpack_32_1x64 (s), negate_1x64 (
1113 expand_alpha_1x64 (unpack_32_1x64 (d)))));
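/* ATOP operator: dst = (src * dst.alpha + dst * (255 - src.alpha)) / 255 */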
1121 static force_inline uint32_t
1122 core_combine_atop_u_pixel_sse2 (uint32_t src,
1125 __m64 s = unpack_32_1x64 (src);
1126 __m64 d = unpack_32_1x64 (dst);
1128 __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1129 __m64 da = expand_alpha_1x64 (d);
1131 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1134 static force_inline void
1135 core_combine_atop_u_sse2 (uint32_t* pd,
1142 __m128i xmm_src_lo, xmm_src_hi;
1143 __m128i xmm_dst_lo, xmm_dst_hi;
1144 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1145 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1147 /* issue prefetch hints to optimize cache loads */
1148 cache_prefetch ((__m128i*)ps);
1149 cache_prefetch ((__m128i*)pd);
1150 cache_prefetch ((__m128i*)pm);
1152 while (w && ((unsigned long) pd & 15))
1154 s = combine1 (ps, pm);
1157 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1164 /* issue prefetch hints to optimize cache loads */
1165 cache_prefetch ((__m128i*)ps);
1166 cache_prefetch ((__m128i*)pd);
1167 cache_prefetch ((__m128i*)pm);
1171 /* prefetch the next cache line */
1172 cache_prefetch_next ((__m128i*)ps);
1173 cache_prefetch_next ((__m128i*)pd);
1174 cache_prefetch_next ((__m128i*)pm);
1176 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1177 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1179 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1180 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1182 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1183 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1184 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1185 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1187 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1188 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1190 pix_add_multiply_2x128 (
1191 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1192 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1193 &xmm_dst_lo, &xmm_dst_hi);
1196 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1207 s = combine1 (ps, pm);
1210 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1218 static force_inline uint32_t
1219 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1222 __m64 s = unpack_32_1x64 (src);
1223 __m64 d = unpack_32_1x64 (dst);
1225 __m64 sa = expand_alpha_1x64 (s);
1226 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1228 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1231 static force_inline void
1232 core_combine_reverse_atop_u_sse2 (uint32_t* pd,
1239 __m128i xmm_src_lo, xmm_src_hi;
1240 __m128i xmm_dst_lo, xmm_dst_hi;
1241 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1242 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1244 /* issue prefetch hints to optimize cache loads */
1245 cache_prefetch ((__m128i*)ps);
1246 cache_prefetch ((__m128i*)pd);
1247 cache_prefetch ((__m128i*)pm);
1249 while (w && ((unsigned long) pd & 15))
1251 s = combine1 (ps, pm);
1254 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1261 /* issue prefetch hints to optimize cache loads */
1262 cache_prefetch ((__m128i*)ps);
1263 cache_prefetch ((__m128i*)pd);
1264 cache_prefetch ((__m128i*)pm);
1268 /* prefetch the next cache line */
1269 cache_prefetch_next ((__m128i*)ps);
1270 cache_prefetch_next ((__m128i*)pd);
1271 cache_prefetch_next ((__m128i*)pm);
1273 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1274 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1276 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1277 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1279 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1280 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1281 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1282 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1284 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1285 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1287 pix_add_multiply_2x128 (
1288 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1289 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1290 &xmm_dst_lo, &xmm_dst_hi);
1293 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1304 s = combine1 (ps, pm);
1307 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
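/* XOR operator:
 * dst = (src * (255 - dst.alpha) + dst * (255 - src.alpha)) / 255 */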
1315 static force_inline uint32_t
1316 core_combine_xor_u_pixel_sse2 (uint32_t src,
1319 __m64 s = unpack_32_1x64 (src);
1320 __m64 d = unpack_32_1x64 (dst);
1322 __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1323 __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1325 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1328 static force_inline void
1329 core_combine_xor_u_sse2 (uint32_t* dst,
1330 const uint32_t* src,
1331 const uint32_t *mask,
1337 const uint32_t* ps = src;
1338 const uint32_t* pm = mask;
1340 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1341 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1342 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1343 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1345 /* issue prefetch hints to optimize cache loads */
1346 cache_prefetch ((__m128i*)ps);
1347 cache_prefetch ((__m128i*)pd);
1348 cache_prefetch ((__m128i*)pm);
1350 while (w && ((unsigned long) pd & 15))
1352 s = combine1 (ps, pm);
1355 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1362 /* issue prefetch hints to optimize cache loads */
1363 cache_prefetch ((__m128i*)ps);
1364 cache_prefetch ((__m128i*)pd);
1365 cache_prefetch ((__m128i*)pm);
1369 /* prefetch the next cache line */
1370 cache_prefetch_next ((__m128i*)ps);
1371 cache_prefetch_next ((__m128i*)pd);
1372 cache_prefetch_next ((__m128i*)pm);
1374 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1375 xmm_dst = load_128_aligned ((__m128i*) pd);
1377 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1378 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1380 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1381 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1382 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1383 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1385 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1386 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1387 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1388 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1390 pix_add_multiply_2x128 (
1391 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1392 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1393 &xmm_dst_lo, &xmm_dst_hi);
1396 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1407 s = combine1 (ps, pm);
1410 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1418 static force_inline void
1419 core_combine_add_u_sse2 (uint32_t* dst,
1420 const uint32_t* src,
1421 const uint32_t* mask,
1427 const uint32_t* ps = src;
1428 const uint32_t* pm = mask;
1430 /* issue prefetch hints to optimize cache loads */
1431 cache_prefetch ((__m128i*)ps);
1432 cache_prefetch ((__m128i*)pd);
1433 cache_prefetch ((__m128i*)pm);
1435 while (w && (unsigned long)pd & 15)
1437 s = combine1 (ps, pm);
1443 *pd++ = _mm_cvtsi64_si32 (
1444 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1448 /* issue prefetch hints to optimize cache loads */
1449 cache_prefetch ((__m128i*)ps);
1450 cache_prefetch ((__m128i*)pd);
1451 cache_prefetch ((__m128i*)pm);
1457 /* prefetch the next cache line */
1458 cache_prefetch_next ((__m128i*)ps);
1459 cache_prefetch_next ((__m128i*)pd);
1460 cache_prefetch_next ((__m128i*)pm);
1462 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1465 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1476 s = combine1 (ps, pm);
1480 *pd++ = _mm_cvtsi64_si32 (
1481 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
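/* SATURATE operator: when the source alpha exceeds the space left in the
 * destination (~dst.alpha), the source is first scaled down by
 * DIV_UN8 (~dst.alpha, src.alpha) so the saturating add cannot overflow. */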
1487 static force_inline uint32_t
1488 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1491 __m64 ms = unpack_32_1x64 (src);
1492 __m64 md = unpack_32_1x64 (dst);
1493 uint32_t sa = src >> 24;
1494 uint32_t da = ~dst >> 24;
1498 ms = pix_multiply_1x64 (
1499 ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1502 return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1505 static force_inline void
1506 core_combine_saturate_u_sse2 (uint32_t * pd,
1514 __m128i xmm_src, xmm_dst;
1516 /* issue prefetch hints to optimize cache loads */
1517 cache_prefetch ((__m128i*)ps);
1518 cache_prefetch ((__m128i*)pd);
1519 cache_prefetch ((__m128i*)pm);
1521 while (w && (unsigned long)pd & 15)
1523 s = combine1 (ps, pm);
1526 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1533 /* issue prefetch hints to optimize cache loads */
1534 cache_prefetch ((__m128i*)ps);
1535 cache_prefetch ((__m128i*)pd);
1536 cache_prefetch ((__m128i*)pm);
1540 /* prefetch the next cache line */
1541 cache_prefetch_next ((__m128i*)ps);
1542 cache_prefetch_next ((__m128i*)pd);
1543 cache_prefetch_next ((__m128i*)pm);
1545 xmm_dst = load_128_aligned ((__m128i*)pd);
1546 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1548 pack_cmp = _mm_movemask_epi8 (
1550 _mm_srli_epi32 (xmm_src, 24),
1551 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1553 /* if any src alpha is greater than the respective ~dst alpha */
1556 s = combine1 (ps++, pm);
1558 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1562 s = combine1 (ps++, pm);
1564 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1568 s = combine1 (ps++, pm);
1570 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1574 s = combine1 (ps++, pm);
1576 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1582 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1595 s = combine1 (ps, pm);
1598 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1605 static force_inline void
1606 core_combine_src_ca_sse2 (uint32_t* pd,
1613 __m128i xmm_src_lo, xmm_src_hi;
1614 __m128i xmm_mask_lo, xmm_mask_hi;
1615 __m128i xmm_dst_lo, xmm_dst_hi;
1617 /* issue prefetch hints to optimize cache loads */
1618 cache_prefetch ((__m128i*)ps);
1619 cache_prefetch ((__m128i*)pd);
1620 cache_prefetch ((__m128i*)pm);
1622 while (w && (unsigned long)pd & 15)
1626 *pd++ = pack_1x64_32 (
1627 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1631 /* issue prefetch hints to optimize cache loads */
1632 cache_prefetch ((__m128i*)ps);
1633 cache_prefetch ((__m128i*)pd);
1634 cache_prefetch ((__m128i*)pm);
1638 /* prefetch the next cache line */
1639 cache_prefetch_next ((__m128i*)ps);
1640 cache_prefetch_next ((__m128i*)pd);
1641 cache_prefetch_next ((__m128i*)pm);
1643 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1644 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1646 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1647 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1649 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1650 &xmm_mask_lo, &xmm_mask_hi,
1651 &xmm_dst_lo, &xmm_dst_hi);
1654 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1666 *pd++ = pack_1x64_32 (
1667 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1672 static force_inline uint32_t
1673 core_combine_over_ca_pixel_sse2 (uint32_t src,
1677 __m64 s = unpack_32_1x64 (src);
1678 __m64 expAlpha = expand_alpha_1x64 (s);
1679 __m64 unpk_mask = unpack_32_1x64 (mask);
1680 __m64 unpk_dst = unpack_32_1x64 (dst);
1682 return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1685 static force_inline void
1686 core_combine_over_ca_sse2 (uint32_t* pd,
1693 __m128i xmm_alpha_lo, xmm_alpha_hi;
1694 __m128i xmm_src_lo, xmm_src_hi;
1695 __m128i xmm_dst_lo, xmm_dst_hi;
1696 __m128i xmm_mask_lo, xmm_mask_hi;
1698 /* issue prefetch hints to optimize cache loads */
1699 cache_prefetch ((__m128i*)ps);
1700 cache_prefetch ((__m128i*)pd);
1701 cache_prefetch ((__m128i*)pm);
1703 while (w && (unsigned long)pd & 15)
1709 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1713 /* issue prefetch hints to optimize cache loads */
1714 cache_prefetch ((__m128i*)ps);
1715 cache_prefetch ((__m128i*)pd);
1716 cache_prefetch ((__m128i*)pm);
1720 /* prefetch the next cache line */
1721 cache_prefetch_next ((__m128i*)ps);
1722 cache_prefetch_next ((__m128i*)pd);
1723 cache_prefetch_next ((__m128i*)pm);
1725 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1726 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1727 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1729 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1730 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1731 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1733 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1734 &xmm_alpha_lo, &xmm_alpha_hi);
1736 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1737 &xmm_alpha_lo, &xmm_alpha_hi,
1738 &xmm_mask_lo, &xmm_mask_hi,
1739 &xmm_dst_lo, &xmm_dst_hi);
1742 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1756 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1761 static force_inline uint32_t
1762 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1766 __m64 d = unpack_32_1x64 (dst);
1768 return pack_1x64_32 (
1769 over_1x64 (d, expand_alpha_1x64 (d),
1770 pix_multiply_1x64 (unpack_32_1x64 (src),
1771 unpack_32_1x64 (mask))));
1774 static force_inline void
1775 core_combine_over_reverse_ca_sse2 (uint32_t* pd,
1782 __m128i xmm_alpha_lo, xmm_alpha_hi;
1783 __m128i xmm_src_lo, xmm_src_hi;
1784 __m128i xmm_dst_lo, xmm_dst_hi;
1785 __m128i xmm_mask_lo, xmm_mask_hi;
1787 /* issue prefetch hints to optimize cache loads */
1788 cache_prefetch ((__m128i*)ps);
1789 cache_prefetch ((__m128i*)pd);
1790 cache_prefetch ((__m128i*)pm);
1792 while (w && (unsigned long)pd & 15)
1798 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1802 /* issue prefetch hints to optimize cache loads */
1803 cache_prefetch ((__m128i*)ps);
1804 cache_prefetch ((__m128i*)pd);
1805 cache_prefetch ((__m128i*)pm);
1809 /* prefetch the next cache line */
1810 cache_prefetch_next ((__m128i*)ps);
1811 cache_prefetch_next ((__m128i*)pd);
1812 cache_prefetch_next ((__m128i*)pm);
1814 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1815 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1816 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1818 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1819 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1820 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1822 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1823 &xmm_alpha_lo, &xmm_alpha_hi);
1824 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1825 &xmm_mask_lo, &xmm_mask_hi,
1826 &xmm_mask_lo, &xmm_mask_hi);
1828 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1829 &xmm_alpha_lo, &xmm_alpha_hi,
1830 &xmm_mask_lo, &xmm_mask_hi);
1833 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1847 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1852 static force_inline void
1853 core_combine_in_ca_sse2 (uint32_t * pd,
1860 __m128i xmm_alpha_lo, xmm_alpha_hi;
1861 __m128i xmm_src_lo, xmm_src_hi;
1862 __m128i xmm_dst_lo, xmm_dst_hi;
1863 __m128i xmm_mask_lo, xmm_mask_hi;
1865 /* issue prefetch hints to optimize cache loads */
1866 cache_prefetch ((__m128i*)ps);
1867 cache_prefetch ((__m128i*)pd);
1868 cache_prefetch ((__m128i*)pm);
1870 while (w && (unsigned long)pd & 15)
1876 *pd++ = pack_1x64_32 (
1878 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1879 expand_alpha_1x64 (unpack_32_1x64 (d))));
1884 /* issue prefetch hints to optimize cache loads */
1885 cache_prefetch ((__m128i*)ps);
1886 cache_prefetch ((__m128i*)pd);
1887 cache_prefetch ((__m128i*)pm);
1891 /* prefetch the next cache line */
1892 cache_prefetch_next ((__m128i*)ps);
1893 cache_prefetch_next ((__m128i*)pd);
1894 cache_prefetch_next ((__m128i*)pm);
1896 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1897 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1898 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1900 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1901 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1902 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1904 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1905 &xmm_alpha_lo, &xmm_alpha_hi);
1907 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1908 &xmm_mask_lo, &xmm_mask_hi,
1909 &xmm_dst_lo, &xmm_dst_hi);
1911 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1912 &xmm_alpha_lo, &xmm_alpha_hi,
1913 &xmm_dst_lo, &xmm_dst_hi);
1916 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1930 *pd++ = pack_1x64_32 (
1933 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1934 expand_alpha_1x64 (unpack_32_1x64 (d))));
1940 static force_inline void
1941 core_combine_in_reverse_ca_sse2 (uint32_t * pd,
1948 __m128i xmm_alpha_lo, xmm_alpha_hi;
1949 __m128i xmm_src_lo, xmm_src_hi;
1950 __m128i xmm_dst_lo, xmm_dst_hi;
1951 __m128i xmm_mask_lo, xmm_mask_hi;
1953 /* issue prefetch hints to optimize cache loads */
1954 cache_prefetch ((__m128i*)ps);
1955 cache_prefetch ((__m128i*)pd);
1956 cache_prefetch ((__m128i*)pm);
1958 while (w && (unsigned long)pd & 15)
1964 *pd++ = pack_1x64_32 (
1967 pix_multiply_1x64 (unpack_32_1x64 (m),
1968 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1972 /* issue prefetch hints to optimize cache loads */
1973 cache_prefetch ((__m128i*)ps);
1974 cache_prefetch ((__m128i*)pd);
1975 cache_prefetch ((__m128i*)pm);
1979 /* prefetch the next cache line */
1980 cache_prefetch_next ((__m128i*)ps);
1981 cache_prefetch_next ((__m128i*)pd);
1982 cache_prefetch_next ((__m128i*)pm);
1984 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1985 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1986 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1988 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1989 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1990 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1992 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1993 &xmm_alpha_lo, &xmm_alpha_hi);
1994 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1995 &xmm_alpha_lo, &xmm_alpha_hi,
1996 &xmm_alpha_lo, &xmm_alpha_hi);
1998 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1999 &xmm_alpha_lo, &xmm_alpha_hi,
2000 &xmm_dst_lo, &xmm_dst_hi);
2003 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2017 *pd++ = pack_1x64_32 (
2020 pix_multiply_1x64 (unpack_32_1x64 (m),
2021 expand_alpha_1x64 (unpack_32_1x64 (s)))));
2026 static force_inline void
2027 core_combine_out_ca_sse2 (uint32_t * pd,
2034 __m128i xmm_alpha_lo, xmm_alpha_hi;
2035 __m128i xmm_src_lo, xmm_src_hi;
2036 __m128i xmm_dst_lo, xmm_dst_hi;
2037 __m128i xmm_mask_lo, xmm_mask_hi;
2039 /* issue prefetch hints to optimize cache loads */
2040 cache_prefetch ((__m128i*)ps);
2041 cache_prefetch ((__m128i*)pd);
2042 cache_prefetch ((__m128i*)pm);
2044 while (w && (unsigned long)pd & 15)
2050 *pd++ = pack_1x64_32 (
2053 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2054 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2058 /* issue prefetch hints to optimize cache loads */
2059 cache_prefetch ((__m128i*)ps);
2060 cache_prefetch ((__m128i*)pd);
2061 cache_prefetch ((__m128i*)pm);
2065 /* prefetch the next cache line */
2066 cache_prefetch_next ((__m128i*)ps);
2067 cache_prefetch_next ((__m128i*)pd);
2068 cache_prefetch_next ((__m128i*)pm);
2070 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2071 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2072 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2074 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2075 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2076 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2078 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2079 &xmm_alpha_lo, &xmm_alpha_hi);
2080 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2081 &xmm_alpha_lo, &xmm_alpha_hi);
2083 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2084 &xmm_mask_lo, &xmm_mask_hi,
2085 &xmm_dst_lo, &xmm_dst_hi);
2086 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2087 &xmm_alpha_lo, &xmm_alpha_hi,
2088 &xmm_dst_lo, &xmm_dst_hi);
2091 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2105 *pd++ = pack_1x64_32 (
2108 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2109 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2115 static force_inline void
2116 core_combine_out_reverse_ca_sse2 (uint32_t * pd,
2123 __m128i xmm_alpha_lo, xmm_alpha_hi;
2124 __m128i xmm_src_lo, xmm_src_hi;
2125 __m128i xmm_dst_lo, xmm_dst_hi;
2126 __m128i xmm_mask_lo, xmm_mask_hi;
2128 /* issue prefetch hints to optimize cache loads */
2129 cache_prefetch ((__m128i*)ps);
2130 cache_prefetch ((__m128i*)pd);
2131 cache_prefetch ((__m128i*)pm);
2133 while (w && (unsigned long)pd & 15)
2139 *pd++ = pack_1x64_32 (
2142 negate_1x64 (pix_multiply_1x64 (
2144 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2148 /* issue prefetch hints to optimize cache loads */
2149 cache_prefetch ((__m128i*)ps);
2150 cache_prefetch ((__m128i*)pd);
2151 cache_prefetch ((__m128i*)pm);
2155 /* prefetch the next cache line */
2156 cache_prefetch_next ((__m128i*)ps);
2157 cache_prefetch_next ((__m128i*)pd);
2158 cache_prefetch_next ((__m128i*)pm);
2160 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2164 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2168 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169 &xmm_alpha_lo, &xmm_alpha_hi);
2171 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2172 &xmm_alpha_lo, &xmm_alpha_hi,
2173 &xmm_mask_lo, &xmm_mask_hi);
2175 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2176 &xmm_mask_lo, &xmm_mask_hi);
2178 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2179 &xmm_mask_lo, &xmm_mask_hi,
2180 &xmm_dst_lo, &xmm_dst_hi);
2183 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2197 *pd++ = pack_1x64_32 (
2200 negate_1x64 (pix_multiply_1x64 (
2202 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2207 static force_inline uint32_t
2208 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2212 __m64 m = unpack_32_1x64 (mask);
2213 __m64 s = unpack_32_1x64 (src);
2214 __m64 d = unpack_32_1x64 (dst);
2215 __m64 sa = expand_alpha_1x64 (s);
2216 __m64 da = expand_alpha_1x64 (d);
2218 s = pix_multiply_1x64 (s, m);
2219 m = negate_1x64 (pix_multiply_1x64 (m, sa));
2221 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2224 static force_inline void
2225 core_combine_atop_ca_sse2 (uint32_t * pd,
2232 __m128i xmm_src_lo, xmm_src_hi;
2233 __m128i xmm_dst_lo, xmm_dst_hi;
2234 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2235 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2236 __m128i xmm_mask_lo, xmm_mask_hi;
2238 /* issue prefetch hints to optimize cache loads */
2239 cache_prefetch ((__m128i*)ps);
2240 cache_prefetch ((__m128i*)pd);
2241 cache_prefetch ((__m128i*)pm);
2243 while (w && (unsigned long)pd & 15)
2249 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2253 /* issue prefetch hints to optimize cache loads */
2254 cache_prefetch ((__m128i*)ps);
2255 cache_prefetch ((__m128i*)pd);
2256 cache_prefetch ((__m128i*)pm);
2260 /* prefetch the next cache line */
2261 cache_prefetch_next ((__m128i*)ps);
2262 cache_prefetch_next ((__m128i*)pd);
2263 cache_prefetch_next ((__m128i*)pm);
2265 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2266 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2267 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2269 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2270 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2271 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2273 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2274 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2275 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2276 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2278 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2279 &xmm_mask_lo, &xmm_mask_hi,
2280 &xmm_src_lo, &xmm_src_hi);
2281 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2282 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2283 &xmm_mask_lo, &xmm_mask_hi);
2285 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2287 pix_add_multiply_2x128 (
2288 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2289 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2290 &xmm_dst_lo, &xmm_dst_hi);
2293 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2307 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2312 static force_inline uint32_t
2313 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2317 __m64 m = unpack_32_1x64 (mask);
2318 __m64 s = unpack_32_1x64 (src);
2319 __m64 d = unpack_32_1x64 (dst);
2321 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2322 __m64 sa = expand_alpha_1x64 (s);
2324 s = pix_multiply_1x64 (s, m);
2325 m = pix_multiply_1x64 (m, sa);
2327 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2330 static force_inline void
2331 core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
2338 __m128i xmm_src_lo, xmm_src_hi;
2339 __m128i xmm_dst_lo, xmm_dst_hi;
2340 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2341 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2342 __m128i xmm_mask_lo, xmm_mask_hi;
2344 /* issue prefetch hints to optimize cache loads */
2345 cache_prefetch ((__m128i*)ps);
2346 cache_prefetch ((__m128i*)pd);
2347 cache_prefetch ((__m128i*)pm);
2349 while (w && (unsigned long)pd & 15)
2355 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2359 /* issue prefetch hints to optimize cache loads */
2360 cache_prefetch ((__m128i*)ps);
2361 cache_prefetch ((__m128i*)pd);
2362 cache_prefetch ((__m128i*)pm);
2366 /* prefetch the next cache line */
2367 cache_prefetch_next ((__m128i*)ps);
2368 cache_prefetch_next ((__m128i*)pd);
2369 cache_prefetch_next ((__m128i*)pm);
2371 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2372 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2373 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2375 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2376 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2377 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2379 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2380 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2381 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2382 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2384 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2385 &xmm_mask_lo, &xmm_mask_hi,
2386 &xmm_src_lo, &xmm_src_hi);
2387 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2388 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2389 &xmm_mask_lo, &xmm_mask_hi);
2391 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2392 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2394 pix_add_multiply_2x128 (
2395 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2396 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2397 &xmm_dst_lo, &xmm_dst_hi);
2400 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2414 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2419 static force_inline uint32_t
2420 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2424 __m64 a = unpack_32_1x64 (mask);
2425 __m64 s = unpack_32_1x64 (src);
2426 __m64 d = unpack_32_1x64 (dst);
2428 __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2429 a, expand_alpha_1x64 (s)));
2430 __m64 dest = pix_multiply_1x64 (s, a);
2431 __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2433 return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2439 static force_inline void
2440 core_combine_xor_ca_sse2 (uint32_t * pd,
2447 __m128i xmm_src_lo, xmm_src_hi;
2448 __m128i xmm_dst_lo, xmm_dst_hi;
2449 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2450 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2451 __m128i xmm_mask_lo, xmm_mask_hi;
2453 /* issue prefetch hints to optimize cache loads */
2454 cache_prefetch ((__m128i*)ps);
2455 cache_prefetch ((__m128i*)pd);
2456 cache_prefetch ((__m128i*)pm);
2458 while (w && (unsigned long)pd & 15)
2464 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2468 /* issue prefetch hints to optimize cache loads */
2469 cache_prefetch ((__m128i*)ps);
2470 cache_prefetch ((__m128i*)pd);
2471 cache_prefetch ((__m128i*)pm);
2475 /* prefetch the next cache line */
2476 cache_prefetch_next ((__m128i*)ps);
2477 cache_prefetch_next ((__m128i*)pd);
2478 cache_prefetch_next ((__m128i*)pm);
2480 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2481 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2482 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2484 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2485 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2486 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2488 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2489 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2490 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2491 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2493 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2494 &xmm_mask_lo, &xmm_mask_hi,
2495 &xmm_src_lo, &xmm_src_hi);
2496 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2497 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2498 &xmm_mask_lo, &xmm_mask_hi);
2500 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2501 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2502 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2503 &xmm_mask_lo, &xmm_mask_hi);
2505 pix_add_multiply_2x128 (
2506 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2507 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2508 &xmm_dst_lo, &xmm_dst_hi);
2511 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2525 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2530 static force_inline void
2531 core_combine_add_ca_sse2 (uint32_t * pd,
2538 __m128i xmm_src_lo, xmm_src_hi;
2539 __m128i xmm_dst_lo, xmm_dst_hi;
2540 __m128i xmm_mask_lo, xmm_mask_hi;
2542 /* issue prefetch hints to optimize cache loads */
2543 cache_prefetch ((__m128i*)ps);
2544 cache_prefetch ((__m128i*)pd);
2545 cache_prefetch ((__m128i*)pm);
2547 while (w && (unsigned long)pd & 15)
2553 *pd++ = pack_1x64_32 (
2554 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2555 unpack_32_1x64 (m)),
2556 unpack_32_1x64 (d)));
2560 /* issue prefetch hints to optimize cache loads */
2561 cache_prefetch ((__m128i*)ps);
2562 cache_prefetch ((__m128i*)pd);
2563 cache_prefetch ((__m128i*)pm);
2567 /* prefetch the next cache line */
2568 cache_prefetch_next ((__m128i*)ps);
2569 cache_prefetch_next ((__m128i*)pd);
2570 cache_prefetch_next ((__m128i*)pm);
2572 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2573 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2574 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2576 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2577 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2578 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2580 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2581 &xmm_mask_lo, &xmm_mask_hi,
2582 &xmm_src_lo, &xmm_src_hi);
2585 (__m128i*)pd, pack_2x128_128 (
2586 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2587 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2601 *pd++ = pack_1x64_32 (
2602 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2603 unpack_32_1x64 (m)),
2604 unpack_32_1x64 (d)));
2609 /* ---------------------------------------------------
2610 * fb_compose_setup_sse2
2612 static force_inline __m64
2613 create_mask_16_64 (uint16_t mask)
2615 return _mm_set1_pi16 (mask);
2618 static force_inline __m128i
2619 create_mask_16_128 (uint16_t mask)
2621 return _mm_set1_epi16 (mask);
2624 static force_inline __m64
2625 create_mask_2x32_64 (uint32_t mask0,
2628 return _mm_set_pi32 (mask0, mask1);
2631 static force_inline __m128i
2632 create_mask_2x32_128 (uint32_t mask0,
2635 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
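/* A sketch of how the global masks declared at the top of this file are
 * presumably initialized with these helpers (the actual assignments live in
 * the elided implementation setup code; the constants follow from how the
 * masks are used above):
 *
 *     mask_0080 = create_mask_16_128 (0x0080);
 *     mask_00ff = create_mask_16_128 (0x00ff);
 *     mask_0101 = create_mask_16_128 (0x0101);
 *     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
 */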
2638 /* SSE2 code paths for fbcompose.c */
2641 sse2_combine_over_u (pixman_implementation_t *imp,
2644 const uint32_t * src,
2645 const uint32_t * mask,
2648 core_combine_over_u_sse2 (dst, src, mask, width);
2653 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2656 const uint32_t * src,
2657 const uint32_t * mask,
2660 core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2665 sse2_combine_in_u (pixman_implementation_t *imp,
2668 const uint32_t * src,
2669 const uint32_t * mask,
2672 core_combine_in_u_sse2 (dst, src, mask, width);
2677 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2680 const uint32_t * src,
2681 const uint32_t * mask,
2684 core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2689 sse2_combine_out_u (pixman_implementation_t *imp,
2692 const uint32_t * src,
2693 const uint32_t * mask,
2696 core_combine_out_u_sse2 (dst, src, mask, width);
2701 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2704 const uint32_t * src,
2705 const uint32_t * mask,
2708 core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2713 sse2_combine_atop_u (pixman_implementation_t *imp,
2716 const uint32_t * src,
2717 const uint32_t * mask,
2720 core_combine_atop_u_sse2 (dst, src, mask, width);
2725 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2728 const uint32_t * src,
2729 const uint32_t * mask,
2732 core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2737 sse2_combine_xor_u (pixman_implementation_t *imp,
2740 const uint32_t * src,
2741 const uint32_t * mask,
2744 core_combine_xor_u_sse2 (dst, src, mask, width);
2749 sse2_combine_add_u (pixman_implementation_t *imp,
2752 const uint32_t * src,
2753 const uint32_t * mask,
2756 core_combine_add_u_sse2 (dst, src, mask, width);
2761 sse2_combine_saturate_u (pixman_implementation_t *imp,
2764 const uint32_t * src,
2765 const uint32_t * mask,
2768 core_combine_saturate_u_sse2 (dst, src, mask, width);
2773 sse2_combine_src_ca (pixman_implementation_t *imp,
2776 const uint32_t * src,
2777 const uint32_t * mask,
2780 core_combine_src_ca_sse2 (dst, src, mask, width);
2785 sse2_combine_over_ca (pixman_implementation_t *imp,
2788 const uint32_t * src,
2789 const uint32_t * mask,
2792 core_combine_over_ca_sse2 (dst, src, mask, width);
2797 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2800 const uint32_t * src,
2801 const uint32_t * mask,
2804 core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2809 sse2_combine_in_ca (pixman_implementation_t *imp,
2812 const uint32_t * src,
2813 const uint32_t * mask,
2816 core_combine_in_ca_sse2 (dst, src, mask, width);
2821 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2824 const uint32_t * src,
2825 const uint32_t * mask,
2828 core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2833 sse2_combine_out_ca (pixman_implementation_t *imp,
2836 const uint32_t * src,
2837 const uint32_t * mask,
2840 core_combine_out_ca_sse2 (dst, src, mask, width);
2845 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2848 const uint32_t * src,
2849 const uint32_t * mask,
2852 core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2857 sse2_combine_atop_ca (pixman_implementation_t *imp,
2860 const uint32_t * src,
2861 const uint32_t * mask,
2864 core_combine_atop_ca_sse2 (dst, src, mask, width);
2869 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2872 const uint32_t * src,
2873 const uint32_t * mask,
2876 core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2881 sse2_combine_xor_ca (pixman_implementation_t *imp,
2884 const uint32_t * src,
2885 const uint32_t * mask,
2888 core_combine_xor_ca_sse2 (dst, src, mask, width);
2893 sse2_combine_add_ca (pixman_implementation_t *imp,
2896 const uint32_t * src,
2897 const uint32_t * mask,
2900 core_combine_add_ca_sse2 (dst, src, mask, width);
2904 /* -------------------------------------------------------------------
2905 * composite_over_n_8888
2909 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2911 pixman_image_t * src_image,
2912 pixman_image_t * mask_image,
2913 pixman_image_t * dst_image,
2924 uint32_t *dst_line, *dst, d;
2927 __m128i xmm_src, xmm_alpha;
2928 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2930 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2935 PIXMAN_IMAGE_GET_LINE (
2936 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2938 xmm_src = expand_pixel_32_1x128 (src);
2939 xmm_alpha = expand_alpha_1x128 (xmm_src);
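/* The solid source and its alpha are loop invariants, so they are
 * expanded to 16 bits per channel once, before the scanline loop. */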
2945 /* call prefetch hint to optimize cache load*/
2946 cache_prefetch ((__m128i*)dst);
2948 dst_line += dst_stride;
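/* Process leading pixels one at a time until dst is 16-byte aligned,
 * so that the vector loop below can use aligned 128-bit accesses. */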
2951 while (w && (unsigned long)dst & 15)
2954 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2955 _mm_movepi64_pi64 (xmm_alpha),
2956 unpack_32_1x64 (d)));
2960 cache_prefetch ((__m128i*)dst);
2964 /* fill cache line with next memory */
2965 cache_prefetch_next ((__m128i*)dst);
2967 xmm_dst = load_128_aligned ((__m128i*)dst);
2969 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2971 over_2x128 (&xmm_src, &xmm_src,
2972 &xmm_alpha, &xmm_alpha,
2973 &xmm_dst_lo, &xmm_dst_hi);
/* rebuild the four pixels and save */
2977 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2986 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2987 _mm_movepi64_pi64 (xmm_alpha),
2988 unpack_32_1x64 (d)));
2996 /* ---------------------------------------------------------------------
2997 * composite_over_n_0565
3000 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3002 pixman_image_t * src_image,
3003 pixman_image_t * mask_image,
3004 pixman_image_t * dst_image,
3015 uint16_t *dst_line, *dst, d;
3018 __m128i xmm_src, xmm_alpha;
3019 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3021 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3026 PIXMAN_IMAGE_GET_LINE (
3027 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3029 xmm_src = expand_pixel_32_1x128 (src);
3030 xmm_alpha = expand_alpha_1x128 (xmm_src);
3036 /* call prefetch hint to optimize cache load*/
3037 cache_prefetch ((__m128i*)dst);
3039 dst_line += dst_stride;
3042 while (w && (unsigned long)dst & 15)
3046 *dst++ = pack_565_32_16 (
3047 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3048 _mm_movepi64_pi64 (xmm_alpha),
3049 expand565_16_1x64 (d))));
3053 /* call prefetch hint to optimize cache load*/
3054 cache_prefetch ((__m128i*)dst);
3058 /* fill cache line with next memory */
3059 cache_prefetch_next ((__m128i*)dst);
3061 xmm_dst = load_128_aligned ((__m128i*)dst);
3063 unpack_565_128_4x128 (xmm_dst,
3064 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3066 over_2x128 (&xmm_src, &xmm_src,
3067 &xmm_alpha, &xmm_alpha,
3068 &xmm_dst0, &xmm_dst1);
3069 over_2x128 (&xmm_src, &xmm_src,
3070 &xmm_alpha, &xmm_alpha,
3071 &xmm_dst2, &xmm_dst3);
3073 xmm_dst = pack_565_4x128_128 (
3074 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3076 save_128_aligned ((__m128i*)dst, xmm_dst);
3085 *dst++ = pack_565_32_16 (
3086 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3087 _mm_movepi64_pi64 (xmm_alpha),
3088 expand565_16_1x64 (d))));
3095 /* ------------------------------
3096 * composite_add_n_8888_8888_ca
3099 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3101 pixman_image_t * src_image,
3102 pixman_image_t * mask_image,
3103 pixman_image_t * dst_image,
3114 uint32_t *dst_line, d;
3115 uint32_t *mask_line, m;
3117 int dst_stride, mask_stride;
3119 __m128i xmm_src, xmm_alpha;
3121 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3123 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3125 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3131 PIXMAN_IMAGE_GET_LINE (
3132 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3133 PIXMAN_IMAGE_GET_LINE (
3134 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3136 xmm_src = _mm_unpacklo_epi8 (
3137 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3138 xmm_alpha = expand_alpha_1x128 (xmm_src);
3139 mmx_src = _mm_movepi64_pi64 (xmm_src);
3140 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
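/* Keep 64-bit copies of the expanded source and alpha as well: the
 * unaligned head and tail pixels are handled with 64-bit (MMX-style)
 * arithmetic, while the main loop processes four pixels at a time in
 * 128-bit registers. */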
3145 const uint32_t *pm = (uint32_t *)mask_line;
3146 uint32_t *pd = (uint32_t *)dst_line;
3148 dst_line += dst_stride;
3149 mask_line += mask_stride;
3151 /* call prefetch hint to optimize cache load*/
3152 cache_prefetch ((__m128i*)pd);
3153 cache_prefetch ((__m128i*)pm);
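/* Component-alpha ADD: dest = dest + src * mask, evaluated per
 * channel with unsigned saturation. */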
3155 while (w && (unsigned long)pd & 15)
3163 mmx_mask = unpack_32_1x64 (m);
3164 mmx_dest = unpack_32_1x64 (d);
3166 *pd = pack_1x64_32 (
3167 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3174 /* call prefetch hint to optimize cache load*/
3175 cache_prefetch ((__m128i*)pd);
3176 cache_prefetch ((__m128i*)pm);
3180 /* fill cache line with next memory */
3181 cache_prefetch_next ((__m128i*)pd);
3182 cache_prefetch_next ((__m128i*)pm);
3184 xmm_mask = load_128_unaligned ((__m128i*)pm);
3188 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
/* if every mask pixel is zero, pack_cmp equals 0xffff and the write can be skipped */
3191 if (pack_cmp != 0xffff)
3193 xmm_dst = load_128_aligned ((__m128i*)pd);
3195 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3197 pix_multiply_2x128 (&xmm_src, &xmm_src,
3198 &xmm_mask_lo, &xmm_mask_hi,
3199 &xmm_mask_lo, &xmm_mask_hi);
3200 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3203 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3219 mmx_mask = unpack_32_1x64 (m);
3220 mmx_dest = unpack_32_1x64 (d);
3222 *pd = pack_1x64_32 (
3223 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3234 /* ---------------------------------------------------------------------------
3235 * composite_over_n_8888_8888_ca
3239 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3241 pixman_image_t * src_image,
3242 pixman_image_t * mask_image,
3243 pixman_image_t * dst_image,
3254 uint32_t *dst_line, d;
3255 uint32_t *mask_line, m;
3257 int dst_stride, mask_stride;
3259 __m128i xmm_src, xmm_alpha;
3260 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3261 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3263 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3265 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3270 PIXMAN_IMAGE_GET_LINE (
3271 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3272 PIXMAN_IMAGE_GET_LINE (
3273 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3275 xmm_src = _mm_unpacklo_epi8 (
3276 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3277 xmm_alpha = expand_alpha_1x128 (xmm_src);
3278 mmx_src = _mm_movepi64_pi64 (xmm_src);
3279 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3284 const uint32_t *pm = (uint32_t *)mask_line;
3285 uint32_t *pd = (uint32_t *)dst_line;
3287 dst_line += dst_stride;
3288 mask_line += mask_stride;
3290 /* call prefetch hint to optimize cache load*/
3291 cache_prefetch ((__m128i*)pd);
3292 cache_prefetch ((__m128i*)pm);
3294 while (w && (unsigned long)pd & 15)
3301 mmx_mask = unpack_32_1x64 (m);
3302 mmx_dest = unpack_32_1x64 (d);
3304 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3314 /* call prefetch hint to optimize cache load*/
3315 cache_prefetch ((__m128i*)pd);
3316 cache_prefetch ((__m128i*)pm);
3320 /* fill cache line with next memory */
3321 cache_prefetch_next ((__m128i*)pd);
3322 cache_prefetch_next ((__m128i*)pm);
3324 xmm_mask = load_128_unaligned ((__m128i*)pm);
3328 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
/* if every mask pixel is zero, pack_cmp equals 0xffff and the write can be skipped */
3331 if (pack_cmp != 0xffff)
3333 xmm_dst = load_128_aligned ((__m128i*)pd);
3335 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3336 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3338 in_over_2x128 (&xmm_src, &xmm_src,
3339 &xmm_alpha, &xmm_alpha,
3340 &xmm_mask_lo, &xmm_mask_hi,
3341 &xmm_dst_lo, &xmm_dst_hi);
3344 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3359 mmx_mask = unpack_32_1x64 (m);
3360 mmx_dest = unpack_32_1x64 (d);
3362 *pd = pack_1x64_32 (
3363 in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
/* ---------------------------------------------------------------------
3375 * composite_over_8888_n_8888
3379 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3381 pixman_image_t * src_image,
3382 pixman_image_t * mask_image,
3383 pixman_image_t * dst_image,
3393 uint32_t *dst_line, *dst;
3394 uint32_t *src_line, *src;
3397 int dst_stride, src_stride;
3400 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3401 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3402 __m128i xmm_alpha_lo, xmm_alpha_hi;
3404 PIXMAN_IMAGE_GET_LINE (
3405 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3406 PIXMAN_IMAGE_GET_LINE (
3407 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3409 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3411 xmm_mask = create_mask_16_128 (mask >> 24);
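/* Only the alpha byte of the solid mask is used; broadcast it to
 * every 16-bit lane so one multiply covers all four channels. */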
3416 dst_line += dst_stride;
3418 src_line += src_stride;
3421 /* call prefetch hint to optimize cache load*/
3422 cache_prefetch ((__m128i*)dst);
3423 cache_prefetch ((__m128i*)src);
3425 while (w && (unsigned long)dst & 15)
3427 uint32_t s = *src++;
3430 __m64 ms = unpack_32_1x64 (s);
3431 __m64 alpha = expand_alpha_1x64 (ms);
3432 __m64 dest = _mm_movepi64_pi64 (xmm_mask);
3433 __m64 alpha_dst = unpack_32_1x64 (d);
3435 *dst++ = pack_1x64_32 (
3436 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
3441 /* call prefetch hint to optimize cache load*/
3442 cache_prefetch ((__m128i*)dst);
3443 cache_prefetch ((__m128i*)src);
3447 /* fill cache line with next memory */
3448 cache_prefetch_next ((__m128i*)dst);
3449 cache_prefetch_next ((__m128i*)src);
3451 xmm_src = load_128_unaligned ((__m128i*)src);
3452 xmm_dst = load_128_aligned ((__m128i*)dst);
3454 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3455 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3456 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3457 &xmm_alpha_lo, &xmm_alpha_hi);
3459 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3460 &xmm_alpha_lo, &xmm_alpha_hi,
3461 &xmm_mask, &xmm_mask,
3462 &xmm_dst_lo, &xmm_dst_hi);
3465 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3474 uint32_t s = *src++;
3477 __m64 ms = unpack_32_1x64 (s);
3478 __m64 alpha = expand_alpha_1x64 (ms);
3479 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3480 __m64 dest = unpack_32_1x64 (d);
3482 *dst++ = pack_1x64_32 (
3483 in_over_1x64 (&ms, &alpha, &mask, &dest));
3492 /* ---------------------------------------------------------------------
3493 * composite_over_x888_n_8888
3496 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3498 pixman_image_t * src_image,
3499 pixman_image_t * mask_image,
3500 pixman_image_t * dst_image,
3510 uint32_t *dst_line, *dst;
3511 uint32_t *src_line, *src;
3513 int dst_stride, src_stride;
3516 __m128i xmm_mask, xmm_alpha;
3517 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3518 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3520 PIXMAN_IMAGE_GET_LINE (
3521 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3522 PIXMAN_IMAGE_GET_LINE (
3523 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3525 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3527 xmm_mask = create_mask_16_128 (mask >> 24);
3528 xmm_alpha = mask_00ff;
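/* The x888 source carries no alpha, so it is treated as fully
 * opaque: its alpha is the constant 0x00ff in every lane. */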
3533 dst_line += dst_stride;
3535 src_line += src_stride;
3538 /* call prefetch hint to optimize cache load*/
3539 cache_prefetch ((__m128i*)dst);
3540 cache_prefetch ((__m128i*)src);
3542 while (w && (unsigned long)dst & 15)
3544 uint32_t s = (*src++) | 0xff000000;
3547 __m64 src = unpack_32_1x64 (s);
3548 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3549 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3550 __m64 dest = unpack_32_1x64 (d);
3552 *dst++ = pack_1x64_32 (
3553 in_over_1x64 (&src, &alpha, &mask, &dest));
3558 /* call prefetch hint to optimize cache load*/
3559 cache_prefetch ((__m128i*)dst);
3560 cache_prefetch ((__m128i*)src);
3564 /* fill cache line with next memory */
3565 cache_prefetch_next ((__m128i*)dst);
3566 cache_prefetch_next ((__m128i*)src);
3568 xmm_src = _mm_or_si128 (
3569 load_128_unaligned ((__m128i*)src), mask_ff000000);
3570 xmm_dst = load_128_aligned ((__m128i*)dst);
3572 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3573 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3575 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3576 &xmm_alpha, &xmm_alpha,
3577 &xmm_mask, &xmm_mask,
3578 &xmm_dst_lo, &xmm_dst_hi);
3581 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3591 uint32_t s = (*src++) | 0xff000000;
3594 __m64 src = unpack_32_1x64 (s);
3595 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3596 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3597 __m64 dest = unpack_32_1x64 (d);
3599 *dst++ = pack_1x64_32 (
3600 in_over_1x64 (&src, &alpha, &mask, &dest));
3609 /* --------------------------------------------------------------------
3610 * composite_over_8888_8888
3613 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3615 pixman_image_t * src_image,
3616 pixman_image_t * mask_image,
3617 pixman_image_t * dst_image,
3627 int dst_stride, src_stride;
3628 uint32_t *dst_line, *dst;
3629 uint32_t *src_line, *src;
3631 PIXMAN_IMAGE_GET_LINE (
3632 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3633 PIXMAN_IMAGE_GET_LINE (
3634 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3641 core_combine_over_u_sse2 (dst, src, NULL, width);
3649 /* ------------------------------------------------------------------
3650 * composite_over_8888_0565
3652 static force_inline uint16_t
3653 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
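/* Blend one 8888 source pixel OVER one 565 destination pixel: unpack
 * both to 16 bits per channel, apply OVER, then repack to 565. */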
3657 ms = unpack_32_1x64 (src);
3658 return pack_565_32_16 (
3661 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3665 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3667 pixman_image_t * src_image,
3668 pixman_image_t * mask_image,
3669 pixman_image_t * dst_image,
3679 uint16_t *dst_line, *dst, d;
3680 uint32_t *src_line, *src, s;
3681 int dst_stride, src_stride;
3684 __m128i xmm_alpha_lo, xmm_alpha_hi;
3685 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3686 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3688 PIXMAN_IMAGE_GET_LINE (
3689 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3690 PIXMAN_IMAGE_GET_LINE (
3691 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
* This code was copied from the MMX version, FIXME included.
* If it is a problem there, it is probably a problem here too.
3699 assert (src_image->drawable == mask_image->drawable);
3707 /* call prefetch hint to optimize cache load*/
3708 cache_prefetch ((__m128i*)src);
3709 cache_prefetch ((__m128i*)dst);
3711 dst_line += dst_stride;
3712 src_line += src_stride;
3715 /* Align dst on a 16-byte boundary */
3717 ((unsigned long)dst & 15))
3722 *dst++ = composite_over_8888_0565pixel (s, d);
3726 /* call prefetch hint to optimize cache load*/
3727 cache_prefetch ((__m128i*)src);
3728 cache_prefetch ((__m128i*)dst);
/* The main loop processes 8 pixels per iteration */
3733 /* fill cache line with next memory */
3734 cache_prefetch_next ((__m128i*)src);
3735 cache_prefetch_next ((__m128i*)dst);
/* Load unaligned, since the source address is not known to be
* 16-byte aligned.
3740 xmm_src = load_128_unaligned ((__m128i*) src);
3741 xmm_dst = load_128_aligned ((__m128i*) dst);
3744 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3745 unpack_565_128_4x128 (xmm_dst,
3746 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3747 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3748 &xmm_alpha_lo, &xmm_alpha_hi);
/* Load the next four source pixels early, so the memory read
* overlaps with the computation below.
3753 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3755 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3756 &xmm_alpha_lo, &xmm_alpha_hi,
3757 &xmm_dst0, &xmm_dst1);
3760 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3761 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3762 &xmm_alpha_lo, &xmm_alpha_hi);
3764 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3765 &xmm_alpha_lo, &xmm_alpha_hi,
3766 &xmm_dst2, &xmm_dst3);
3769 (__m128i*)dst, pack_565_4x128_128 (
3770 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3782 *dst++ = composite_over_8888_0565pixel (s, d);
3789 /* -----------------------------------------------------------------
3790 * composite_over_n_8_8888
3794 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3796 pixman_image_t * src_image,
3797 pixman_image_t * mask_image,
3798 pixman_image_t * dst_image,
3809 uint32_t *dst_line, *dst;
3810 uint8_t *mask_line, *mask;
3811 int dst_stride, mask_stride;
3815 __m128i xmm_src, xmm_alpha, xmm_def;
3816 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3817 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3819 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3821 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3827 PIXMAN_IMAGE_GET_LINE (
3828 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3829 PIXMAN_IMAGE_GET_LINE (
3830 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3832 xmm_def = create_mask_2x32_128 (src, src);
3833 xmm_src = expand_pixel_32_1x128 (src);
3834 xmm_alpha = expand_alpha_1x128 (xmm_src);
3835 mmx_src = _mm_movepi64_pi64 (xmm_src);
3836 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3841 dst_line += dst_stride;
3843 mask_line += mask_stride;
3846 /* call prefetch hint to optimize cache load*/
3847 cache_prefetch ((__m128i*)mask);
3848 cache_prefetch ((__m128i*)dst);
3850 while (w && (unsigned long)dst & 15)
3852 uint8_t m = *mask++;
3857 mmx_mask = expand_pixel_8_1x64 (m);
3858 mmx_dest = unpack_32_1x64 (d);
3860 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3870 /* call prefetch hint to optimize cache load*/
3871 cache_prefetch ((__m128i*)mask);
3872 cache_prefetch ((__m128i*)dst);
3876 /* fill cache line with next memory */
3877 cache_prefetch_next ((__m128i*)mask);
3878 cache_prefetch_next ((__m128i*)dst);
3880 m = *((uint32_t*)mask);
3882 if (srca == 0xff && m == 0xffffffff)
3884 save_128_aligned ((__m128i*)dst, xmm_def);
3888 xmm_dst = load_128_aligned ((__m128i*) dst);
3889 xmm_mask = unpack_32_1x128 (m);
3890 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3893 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3894 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3896 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3897 &xmm_mask_lo, &xmm_mask_hi);
3899 in_over_2x128 (&xmm_src, &xmm_src,
3900 &xmm_alpha, &xmm_alpha,
3901 &xmm_mask_lo, &xmm_mask_hi,
3902 &xmm_dst_lo, &xmm_dst_hi);
3905 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3915 uint8_t m = *mask++;
3920 mmx_mask = expand_pixel_8_1x64 (m);
3921 mmx_dest = unpack_32_1x64 (d);
3923 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
/* ----------------------------------------------------------------
 * pixman_fill_sse2
3942 pixman_fill_sse2 (uint32_t *bits,
3951 uint32_t byte_width;
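/* A 16 bpp fill is only possible when the 32-bit fill pattern is the
 * same 16-bit value replicated twice; anything else is rejected. */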
3956 if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3959 if (bpp != 16 && bpp != 32)
3964 stride = stride * (int) sizeof (uint32_t) / 2;
3965 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3966 byte_width = 2 * width;
3971 stride = stride * (int) sizeof (uint32_t) / 4;
3972 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3973 byte_width = 4 * width;
3977 cache_prefetch ((__m128i*)byte_line);
3978 xmm_def = create_mask_2x32_128 (data, data);
3983 uint8_t *d = byte_line;
3984 byte_line += stride;
3988 cache_prefetch_next ((__m128i*)d);
3990 while (w >= 2 && ((unsigned long)d & 3))
3992 *(uint16_t *)d = data;
3997 while (w >= 4 && ((unsigned long)d & 15))
3999 *(uint32_t *)d = data;
4005 cache_prefetch_next ((__m128i*)d);
4009 cache_prefetch (((__m128i*)d) + 12);
4011 save_128_aligned ((__m128i*)(d), xmm_def);
4012 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4013 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4014 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4015 save_128_aligned ((__m128i*)(d + 64), xmm_def);
4016 save_128_aligned ((__m128i*)(d + 80), xmm_def);
4017 save_128_aligned ((__m128i*)(d + 96), xmm_def);
4018 save_128_aligned ((__m128i*)(d + 112), xmm_def);
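/* Each iteration above writes 128 bytes; the narrower loops below
 * mop up 64-, 32- and 16-byte remainders before the scalar tail. */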
4026 cache_prefetch (((__m128i*)d) + 8);
4028 save_128_aligned ((__m128i*)(d), xmm_def);
4029 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4030 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4031 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4037 cache_prefetch_next ((__m128i*)d);
4041 save_128_aligned ((__m128i*)(d), xmm_def);
4042 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4050 save_128_aligned ((__m128i*)(d), xmm_def);
4056 cache_prefetch_next ((__m128i*)d);
4060 *(uint32_t *)d = data;
4068 *(uint16_t *)d = data;
4079 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4081 pixman_image_t * src_image,
4082 pixman_image_t * mask_image,
4083 pixman_image_t * dst_image,
4094 uint32_t *dst_line, *dst;
4095 uint8_t *mask_line, *mask;
4096 int dst_stride, mask_stride;
4100 __m128i xmm_src, xmm_def;
4101 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4103 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4108 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4109 PIXMAN_FORMAT_BPP (dst_image->bits.format),
4110 dest_x, dest_y, width, height, 0);
4114 PIXMAN_IMAGE_GET_LINE (
4115 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4116 PIXMAN_IMAGE_GET_LINE (
4117 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4119 xmm_def = create_mask_2x32_128 (src, src);
4120 xmm_src = expand_pixel_32_1x128 (src);
4125 dst_line += dst_stride;
4127 mask_line += mask_stride;
4130 /* call prefetch hint to optimize cache load*/
4131 cache_prefetch ((__m128i*)mask);
4132 cache_prefetch ((__m128i*)dst);
4134 while (w && (unsigned long)dst & 15)
4136 uint8_t m = *mask++;
4140 *dst = pack_1x64_32 (
4142 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4153 /* call prefetch hint to optimize cache load*/
4154 cache_prefetch ((__m128i*)mask);
4155 cache_prefetch ((__m128i*)dst);
4159 /* fill cache line with next memory */
4160 cache_prefetch_next ((__m128i*)mask);
4161 cache_prefetch_next ((__m128i*)dst);
4163 m = *((uint32_t*)mask);
4165 if (srca == 0xff && m == 0xffffffff)
4167 save_128_aligned ((__m128i*)dst, xmm_def);
4171 xmm_mask = unpack_32_1x128 (m);
4172 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4175 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4177 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4178 &xmm_mask_lo, &xmm_mask_hi);
4180 pix_multiply_2x128 (&xmm_src, &xmm_src,
4181 &xmm_mask_lo, &xmm_mask_hi,
4182 &xmm_mask_lo, &xmm_mask_hi);
4185 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4189 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4199 uint8_t m = *mask++;
4203 *dst = pack_1x64_32 (
4205 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4220 /*-----------------------------------------------------------------------
4221 * composite_over_n_8_0565
4225 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4227 pixman_image_t * src_image,
4228 pixman_image_t * mask_image,
4229 pixman_image_t * dst_image,
4240 uint16_t *dst_line, *dst, d;
4241 uint8_t *mask_line, *mask;
4242 int dst_stride, mask_stride;
4245 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4247 __m128i xmm_src, xmm_alpha;
4248 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4249 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4251 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4257 PIXMAN_IMAGE_GET_LINE (
4258 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4259 PIXMAN_IMAGE_GET_LINE (
4260 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4262 xmm_src = expand_pixel_32_1x128 (src);
4263 xmm_alpha = expand_alpha_1x128 (xmm_src);
4264 mmx_src = _mm_movepi64_pi64 (xmm_src);
4265 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4270 dst_line += dst_stride;
4272 mask_line += mask_stride;
4275 /* call prefetch hint to optimize cache load*/
4276 cache_prefetch ((__m128i*)mask);
4277 cache_prefetch ((__m128i*)dst);
4279 while (w && (unsigned long)dst & 15)
4286 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4287 mmx_dest = expand565_16_1x64 (d);
4289 *dst = pack_565_32_16 (
4292 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4299 /* call prefetch hint to optimize cache load*/
4300 cache_prefetch ((__m128i*)mask);
4301 cache_prefetch ((__m128i*)dst);
4305 /* fill cache line with next memory */
4306 cache_prefetch_next ((__m128i*)mask);
4307 cache_prefetch_next ((__m128i*)dst);
4309 xmm_dst = load_128_aligned ((__m128i*) dst);
4310 unpack_565_128_4x128 (xmm_dst,
4311 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4313 m = *((uint32_t*)mask);
4318 xmm_mask = unpack_32_1x128 (m);
4319 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4322 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4324 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4325 &xmm_mask_lo, &xmm_mask_hi);
4327 in_over_2x128 (&xmm_src, &xmm_src,
4328 &xmm_alpha, &xmm_alpha,
4329 &xmm_mask_lo, &xmm_mask_hi,
4330 &xmm_dst0, &xmm_dst1);
4333 m = *((uint32_t*)mask);
4338 xmm_mask = unpack_32_1x128 (m);
4339 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4342 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4344 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4345 &xmm_mask_lo, &xmm_mask_hi);
4346 in_over_2x128 (&xmm_src, &xmm_src,
4347 &xmm_alpha, &xmm_alpha,
4348 &xmm_mask_lo, &xmm_mask_hi,
4349 &xmm_dst2, &xmm_dst3);
4353 (__m128i*)dst, pack_565_4x128_128 (
4354 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4367 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4368 mmx_dest = expand565_16_1x64 (d);
4370 *dst = pack_565_32_16 (
4373 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4384 /* -----------------------------------------------------------------------
4385 * composite_over_pixbuf_0565
4389 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4391 pixman_image_t * src_image,
4392 pixman_image_t * mask_image,
4393 pixman_image_t * dst_image,
4403 uint16_t *dst_line, *dst, d;
4404 uint32_t *src_line, *src, s;
4405 int dst_stride, src_stride;
4407 uint32_t opaque, zero;
4410 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4411 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4413 PIXMAN_IMAGE_GET_LINE (
4414 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4415 PIXMAN_IMAGE_GET_LINE (
4416 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
* This code was copied from the MMX version, FIXME included.
* If it is a problem there, it is probably a problem here too.
4424 assert (src_image->drawable == mask_image->drawable);
4430 dst_line += dst_stride;
4432 src_line += src_stride;
4435 /* call prefetch hint to optimize cache load*/
4436 cache_prefetch ((__m128i*)src);
4437 cache_prefetch ((__m128i*)dst);
4439 while (w && (unsigned long)dst & 15)
4444 ms = unpack_32_1x64 (s);
4446 *dst++ = pack_565_32_16 (
4448 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4452 /* call prefetch hint to optimize cache load*/
4453 cache_prefetch ((__m128i*)src);
4454 cache_prefetch ((__m128i*)dst);
4458 /* fill cache line with next memory */
4459 cache_prefetch_next ((__m128i*)src);
4460 cache_prefetch_next ((__m128i*)dst);
4463 xmm_src = load_128_unaligned ((__m128i*)src);
4464 xmm_dst = load_128_aligned ((__m128i*)dst);
4466 opaque = is_opaque (xmm_src);
4467 zero = is_zero (xmm_src);
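/* Classify the four source pixels up front: if they are all opaque,
 * OVER reduces to a colour-swapped copy; if they are all zero, the
 * destination is left untouched. */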
4469 unpack_565_128_4x128 (xmm_dst,
4470 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4471 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
/* preload the next round */
4474 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4478 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4479 &xmm_dst0, &xmm_dst1);
4483 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4484 &xmm_dst0, &xmm_dst1);
4488 opaque = is_opaque (xmm_src);
4489 zero = is_zero (xmm_src);
4491 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4495 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4496 &xmm_dst2, &xmm_dst3);
4500 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4501 &xmm_dst2, &xmm_dst3);
4505 (__m128i*)dst, pack_565_4x128_128 (
4506 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4518 ms = unpack_32_1x64 (s);
4520 *dst++ = pack_565_32_16 (
4522 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4530 /* -------------------------------------------------------------------------
4531 * composite_over_pixbuf_8888
4535 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4537 pixman_image_t * src_image,
4538 pixman_image_t * mask_image,
4539 pixman_image_t * dst_image,
4549 uint32_t *dst_line, *dst, d;
4550 uint32_t *src_line, *src, s;
4551 int dst_stride, src_stride;
4553 uint32_t opaque, zero;
4555 __m128i xmm_src_lo, xmm_src_hi;
4556 __m128i xmm_dst_lo, xmm_dst_hi;
4558 PIXMAN_IMAGE_GET_LINE (
4559 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4560 PIXMAN_IMAGE_GET_LINE (
4561 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
* This code was copied from the MMX version, FIXME included.
* If it is a problem there, it is probably a problem here too.
4569 assert (src_image->drawable == mask_image->drawable);
4575 dst_line += dst_stride;
4577 src_line += src_stride;
4580 /* call prefetch hint to optimize cache load*/
4581 cache_prefetch ((__m128i*)src);
4582 cache_prefetch ((__m128i*)dst);
4584 while (w && (unsigned long)dst & 15)
4589 *dst++ = pack_1x64_32 (
4590 over_rev_non_pre_1x64 (
4591 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4596 /* call prefetch hint to optimize cache load*/
4597 cache_prefetch ((__m128i*)src);
4598 cache_prefetch ((__m128i*)dst);
4602 /* fill cache line with next memory */
4603 cache_prefetch_next ((__m128i*)src);
4604 cache_prefetch_next ((__m128i*)dst);
4606 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4608 opaque = is_opaque (xmm_src_hi);
4609 zero = is_zero (xmm_src_hi);
4611 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4615 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4616 &xmm_dst_lo, &xmm_dst_hi);
4619 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4623 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4625 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4627 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4628 &xmm_dst_lo, &xmm_dst_hi);
4631 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4644 *dst++ = pack_1x64_32 (
4645 over_rev_non_pre_1x64 (
4646 unpack_32_1x64 (s), unpack_32_1x64 (d)));
/* -------------------------------------------------------------------------
4656 * composite_over_n_8888_0565_ca
4660 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4662 pixman_image_t * src_image,
4663 pixman_image_t * mask_image,
4664 pixman_image_t * dst_image,
4675 uint16_t *dst_line, *dst, d;
4676 uint32_t *mask_line, *mask, m;
4677 int dst_stride, mask_stride;
4681 __m128i xmm_src, xmm_alpha;
4682 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4683 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4685 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4687 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4692 PIXMAN_IMAGE_GET_LINE (
4693 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4694 PIXMAN_IMAGE_GET_LINE (
4695 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4697 xmm_src = expand_pixel_32_1x128 (src);
4698 xmm_alpha = expand_alpha_1x128 (xmm_src);
4699 mmx_src = _mm_movepi64_pi64 (xmm_src);
4700 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4707 mask_line += mask_stride;
4708 dst_line += dst_stride;
4710 /* call prefetch hint to optimize cache load*/
4711 cache_prefetch ((__m128i*)mask);
4712 cache_prefetch ((__m128i*)dst);
4714 while (w && ((unsigned long)dst & 15))
4716 m = *(uint32_t *) mask;
4721 mmx_mask = unpack_32_1x64 (m);
4722 mmx_dest = expand565_16_1x64 (d);
4724 *dst = pack_565_32_16 (
4727 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4735 /* call prefetch hint to optimize cache load*/
4736 cache_prefetch ((__m128i*)mask);
4737 cache_prefetch ((__m128i*)dst);
4741 /* fill cache line with next memory */
4742 cache_prefetch_next ((__m128i*)mask);
4743 cache_prefetch_next ((__m128i*)dst);
4746 xmm_mask = load_128_unaligned ((__m128i*)mask);
4747 xmm_dst = load_128_aligned ((__m128i*)dst);
4749 pack_cmp = _mm_movemask_epi8 (
4750 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4752 unpack_565_128_4x128 (xmm_dst,
4753 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4754 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4756 /* preload next round */
4757 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4760 if (pack_cmp != 0xffff)
4762 in_over_2x128 (&xmm_src, &xmm_src,
4763 &xmm_alpha, &xmm_alpha,
4764 &xmm_mask_lo, &xmm_mask_hi,
4765 &xmm_dst0, &xmm_dst1);
4769 pack_cmp = _mm_movemask_epi8 (
4770 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4772 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4774 if (pack_cmp != 0xffff)
4776 in_over_2x128 (&xmm_src, &xmm_src,
4777 &xmm_alpha, &xmm_alpha,
4778 &xmm_mask_lo, &xmm_mask_hi,
4779 &xmm_dst2, &xmm_dst3);
4783 (__m128i*)dst, pack_565_4x128_128 (
4784 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4793 m = *(uint32_t *) mask;
4798 mmx_mask = unpack_32_1x64 (m);
4799 mmx_dest = expand565_16_1x64 (d);
4801 *dst = pack_565_32_16 (
4804 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4816 /* -----------------------------------------------------------------------
4817 * composite_in_n_8_8
4821 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4823 pixman_image_t * src_image,
4824 pixman_image_t * mask_image,
4825 pixman_image_t * dst_image,
4835 uint8_t *dst_line, *dst;
4836 uint8_t *mask_line, *mask;
4837 int dst_stride, mask_stride;
4843 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4844 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4846 PIXMAN_IMAGE_GET_LINE (
4847 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4848 PIXMAN_IMAGE_GET_LINE (
4849 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4851 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4855 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4860 dst_line += dst_stride;
4862 mask_line += mask_stride;
4865 /* call prefetch hint to optimize cache load*/
4866 cache_prefetch ((__m128i*)mask);
4867 cache_prefetch ((__m128i*)dst);
4869 while (w && ((unsigned long)dst & 15))
4871 m = (uint32_t) *mask++;
4872 d = (uint32_t) *dst;
4874 *dst++ = (uint8_t) pack_1x64_32 (
4876 pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4877 unpack_32_1x64 (m)),
4878 unpack_32_1x64 (d)));
4882 /* call prefetch hint to optimize cache load*/
4883 cache_prefetch ((__m128i*)mask);
4884 cache_prefetch ((__m128i*)dst);
4888 /* fill cache line with next memory */
4889 cache_prefetch_next ((__m128i*)mask);
4890 cache_prefetch_next ((__m128i*)dst);
4892 xmm_mask = load_128_unaligned ((__m128i*)mask);
4893 xmm_dst = load_128_aligned ((__m128i*)dst);
4895 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4896 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4898 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4899 &xmm_mask_lo, &xmm_mask_hi,
4900 &xmm_mask_lo, &xmm_mask_hi);
4902 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4903 &xmm_dst_lo, &xmm_dst_hi,
4904 &xmm_dst_lo, &xmm_dst_hi);
4907 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4916 m = (uint32_t) *mask++;
4917 d = (uint32_t) *dst;
4919 *dst++ = (uint8_t) pack_1x64_32 (
4922 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4923 unpack_32_1x64 (d)));
/* ---------------------------------------------------------------------------
 * composite_in_8_8
4936 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4938 pixman_image_t * src_image,
4939 pixman_image_t * mask_image,
4940 pixman_image_t * dst_image,
4950 uint8_t *dst_line, *dst;
4951 uint8_t *src_line, *src;
4952 int src_stride, dst_stride;
4956 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4957 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4959 PIXMAN_IMAGE_GET_LINE (
4960 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4961 PIXMAN_IMAGE_GET_LINE (
4962 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4967 dst_line += dst_stride;
4969 src_line += src_stride;
4972 /* call prefetch hint to optimize cache load*/
4973 cache_prefetch ((__m128i*)src);
4974 cache_prefetch ((__m128i*)dst);
4976 while (w && ((unsigned long)dst & 15))
4978 s = (uint32_t) *src++;
4979 d = (uint32_t) *dst;
4981 *dst++ = (uint8_t) pack_1x64_32 (
4983 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4987 /* call prefetch hint to optimize cache load*/
4988 cache_prefetch ((__m128i*)src);
4989 cache_prefetch ((__m128i*)dst);
4993 /* fill cache line with next memory */
4994 cache_prefetch_next ((__m128i*)src);
4995 cache_prefetch_next ((__m128i*)dst);
4997 xmm_src = load_128_unaligned ((__m128i*)src);
4998 xmm_dst = load_128_aligned ((__m128i*)dst);
5000 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5001 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5003 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5004 &xmm_dst_lo, &xmm_dst_hi,
5005 &xmm_dst_lo, &xmm_dst_hi);
5008 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5017 s = (uint32_t) *src++;
5018 d = (uint32_t) *dst;
5020 *dst++ = (uint8_t) pack_1x64_32 (
5021 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5029 /* -------------------------------------------------------------------------
5030 * composite_add_n_8_8
5034 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5036 pixman_image_t * src_image,
5037 pixman_image_t * mask_image,
5038 pixman_image_t * dst_image,
5048 uint8_t *dst_line, *dst;
5049 uint8_t *mask_line, *mask;
5050 int dst_stride, mask_stride;
5057 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5058 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5060 PIXMAN_IMAGE_GET_LINE (
5061 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5062 PIXMAN_IMAGE_GET_LINE (
5063 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5065 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5069 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5074 dst_line += dst_stride;
5076 mask_line += mask_stride;
5079 /* call prefetch hint to optimize cache load*/
5080 cache_prefetch ((__m128i*)mask);
5081 cache_prefetch ((__m128i*)dst);
5083 while (w && ((unsigned long)dst & 15))
5085 m = (uint32_t) *mask++;
5086 d = (uint32_t) *dst;
5088 *dst++ = (uint8_t) pack_1x64_32 (
5091 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5092 unpack_32_1x64 (d)));
5096 /* call prefetch hint to optimize cache load*/
5097 cache_prefetch ((__m128i*)mask);
5098 cache_prefetch ((__m128i*)dst);
5102 /* fill cache line with next memory */
5103 cache_prefetch_next ((__m128i*)mask);
5104 cache_prefetch_next ((__m128i*)dst);
5106 xmm_mask = load_128_unaligned ((__m128i*)mask);
5107 xmm_dst = load_128_aligned ((__m128i*)dst);
5109 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5110 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5112 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5113 &xmm_mask_lo, &xmm_mask_hi,
5114 &xmm_mask_lo, &xmm_mask_hi);
5116 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5117 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5120 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5129 m = (uint32_t) *mask++;
5130 d = (uint32_t) *dst;
5132 *dst++ = (uint8_t) pack_1x64_32 (
5135 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5136 unpack_32_1x64 (d)));
5145 /* ----------------------------------------------------------------------
5146 * composite_add_8000_8000
5150 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5152 pixman_image_t * src_image,
5153 pixman_image_t * mask_image,
5154 pixman_image_t * dst_image,
5164 uint8_t *dst_line, *dst;
5165 uint8_t *src_line, *src;
5166 int dst_stride, src_stride;
5170 PIXMAN_IMAGE_GET_LINE (
5171 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5172 PIXMAN_IMAGE_GET_LINE (
5173 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5180 /* call prefetch hint to optimize cache load*/
5181 cache_prefetch ((__m128i*)src);
5182 cache_prefetch ((__m128i*)dst);
5184 dst_line += dst_stride;
5185 src_line += src_stride;
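/* The scalar head and tail use a branch-free saturating add: t is at
 * most 0x1fe, so (t >> 8) is 0 or 1, and t | (0 - (t >> 8)) clamps
 * the sum to 0xff whenever it overflows a byte. */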
5189 while (w && (unsigned long)dst & 3)
5191 t = (*dst) + (*src++);
5192 *dst++ = t | (0 - (t >> 8));
5196 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5206 t = (*dst) + (*src++);
5207 *dst++ = t | (0 - (t >> 8));
5215 /* ---------------------------------------------------------------------
5216 * composite_add_8888_8888
5219 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5221 pixman_image_t * src_image,
5222 pixman_image_t * mask_image,
5223 pixman_image_t * dst_image,
5233 uint32_t *dst_line, *dst;
5234 uint32_t *src_line, *src;
5235 int dst_stride, src_stride;
5237 PIXMAN_IMAGE_GET_LINE (
5238 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5239 PIXMAN_IMAGE_GET_LINE (
5240 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5245 dst_line += dst_stride;
5247 src_line += src_stride;
5249 core_combine_add_u_sse2 (dst, src, NULL, width);
/* -------------------------------------------------------------------------
5256 * sse2_composite_copy_area
5259 static pixman_bool_t
5260 pixman_blt_sse2 (uint32_t *src_bits,
5273 uint8_t * src_bytes;
5274 uint8_t * dst_bytes;
5277 if (src_bpp != dst_bpp)
5282 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5283 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5286 byte_width = 2 * width;
5290 else if (src_bpp == 32)
5292 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5293 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5294 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5295 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5296 byte_width = 4 * width;
5305 cache_prefetch ((__m128i*)src_bytes);
5306 cache_prefetch ((__m128i*)dst_bytes);
5311 uint8_t *s = src_bytes;
5312 uint8_t *d = dst_bytes;
5313 src_bytes += src_stride;
5314 dst_bytes += dst_stride;
5317 cache_prefetch_next ((__m128i*)s);
5318 cache_prefetch_next ((__m128i*)d);
5320 while (w >= 2 && ((unsigned long)d & 3))
5322 *(uint16_t *)d = *(uint16_t *)s;
5328 while (w >= 4 && ((unsigned long)d & 15))
5330 *(uint32_t *)d = *(uint32_t *)s;
5337 cache_prefetch_next ((__m128i*)s);
5338 cache_prefetch_next ((__m128i*)d);
5342 __m128i xmm0, xmm1, xmm2, xmm3;
5344 /* 128 bytes ahead */
5345 cache_prefetch (((__m128i*)s) + 8);
5346 cache_prefetch (((__m128i*)d) + 8);
5348 xmm0 = load_128_unaligned ((__m128i*)(s));
5349 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5350 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5351 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5353 save_128_aligned ((__m128i*)(d), xmm0);
5354 save_128_aligned ((__m128i*)(d + 16), xmm1);
5355 save_128_aligned ((__m128i*)(d + 32), xmm2);
5356 save_128_aligned ((__m128i*)(d + 48), xmm3);
5363 cache_prefetch_next ((__m128i*)s);
5364 cache_prefetch_next ((__m128i*)d);
save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
5375 cache_prefetch_next ((__m128i*)s);
5376 cache_prefetch_next ((__m128i*)d);
5380 *(uint32_t *)d = *(uint32_t *)s;
5389 *(uint16_t *)d = *(uint16_t *)s;
5402 sse2_composite_copy_area (pixman_implementation_t *imp,
5404 pixman_image_t * src_image,
5405 pixman_image_t * mask_image,
5406 pixman_image_t * dst_image,
5416 pixman_blt_sse2 (src_image->bits.bits,
5417 dst_image->bits.bits,
5418 src_image->bits.rowstride,
5419 dst_image->bits.rowstride,
5420 PIXMAN_FORMAT_BPP (src_image->bits.format),
5421 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5422 src_x, src_y, dest_x, dest_y, width, height);
/* This code was buggy in the MMX version, and the bug has been carried over into this SSE2 version */
5428 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5430 pixman_image_t * src_image,
5431 pixman_image_t * mask_image,
5432 pixman_image_t * dst_image,
5442 uint32_t *src, *src_line, s;
5443 uint32_t *dst, *dst_line, d;
5444 uint8_t *mask, *mask_line;
5446 int src_stride, mask_stride, dst_stride;
5449 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5450 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5451 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5453 PIXMAN_IMAGE_GET_LINE (
5454 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5455 PIXMAN_IMAGE_GET_LINE (
5456 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5457 PIXMAN_IMAGE_GET_LINE (
5458 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5463 src_line += src_stride;
5465 dst_line += dst_stride;
5467 mask_line += mask_stride;
5471 /* call prefetch hint to optimize cache load*/
5472 cache_prefetch ((__m128i*)src);
5473 cache_prefetch ((__m128i*)dst);
5474 cache_prefetch ((__m128i*)mask);
5476 while (w && (unsigned long)dst & 15)
5478 s = 0xff000000 | *src++;
5479 m = (uint32_t) *mask++;
5482 __m64 ms = unpack_32_1x64 (s);
5486 ms = in_over_1x64 (ms,
5488 expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
5489 unpack_32_1x64 (d));
5492 *dst++ = pack_1x64_32 (ms);
5496 /* call prefetch hint to optimize cache load*/
5497 cache_prefetch ((__m128i*)src);
5498 cache_prefetch ((__m128i*)dst);
5499 cache_prefetch ((__m128i*)mask);
5503 /* fill cache line with next memory */
5504 cache_prefetch_next ((__m128i*)src);
5505 cache_prefetch_next ((__m128i*)dst);
5506 cache_prefetch_next ((__m128i*)mask);
5508 m = *(uint32_t*) mask;
5509 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5511 if (m == 0xffffffff)
5513 save_128_aligned ((__m128i*)dst, xmm_src);
5517 xmm_dst = load_128_aligned ((__m128i*)dst);
5519 xmm_mask = _mm_unpacklo_epi16 (
5520 unpack_32_1x128 (m), _mm_setzero_si128 ());
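/* Interleaving with zero gives each of the four 8-bit mask values its
 * own 32-bit pixel slot; expand_alpha_rev_2x128 below then replicates
 * each value across its pixel's colour channels. */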
5522 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5523 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5524 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5526 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
5527 &xmm_mask_lo, &xmm_mask_hi);
5529 in_over_2x128 (xmm_src_lo, xmm_src_hi,
5530 mask_00ff, mask_00ff,
5531 xmm_mask_lo, xmm_mask_hi,
5532 &xmm_dst_lo, &xmm_dst_hi);
5535 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5546 m = (uint32_t) *mask++;
5550 s = 0xff000000 | *src;
5560 *dst = pack_1x64_32 (
5564 expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
5565 unpack_32_1x64 (d)));
5581 static const pixman_fast_path_t sse2_fast_paths[] =
5583 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 },
5584 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 },
5585 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 },
5586 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 },
5587 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 },
5588 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
5589 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
5590 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
5591 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
5592 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
5593 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
5594 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5595 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5596 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5597 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 },
/* FIXME: this code was buggy in the MMX version, and the bug has been carried over into this SSE2 version */
5600 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5601 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5602 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
5603 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5605 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5606 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5607 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5608 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5609 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5610 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5611 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5612 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5613 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5614 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5615 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5616 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5617 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5618 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5619 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5620 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5621 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5622 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5623 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5624 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5625 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5626 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5627 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5628 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5629 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5630 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5631 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5632 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5634 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5635 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 },
5636 { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 },
5637 { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 },
5638 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_n_8_8, 0 },
5640 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5641 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5642 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5643 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5644 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 },
5645 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 },
5646 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5647 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5648 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5649 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5650 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 },
5651 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 },
5653 { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 },
5654 { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 },
5660 * Work around GCC bug causing crashes in Mozilla with SSE2
5662 * When using -msse, gcc generates movdqa instructions assuming that
5663 * the stack is 16 byte aligned. Unfortunately some applications, such
5664 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
5665 * causes the movdqa instructions to fail.
5667 * The __force_align_arg_pointer__ makes gcc generate a prologue that
5668 * realigns the stack pointer to 16 bytes.
5670 * On x86-64 this is not necessary because the standard ABI already
5671 * calls for a 16 byte aligned stack.
5673 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
5675 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5676 __attribute__((__force_align_arg_pointer__))
5679 sse2_composite (pixman_implementation_t *imp,
5681 pixman_image_t * src,
5682 pixman_image_t * mask,
5683 pixman_image_t * dest,
5693 if (_pixman_run_fast_path (sse2_fast_paths, imp,
5694 op, src, mask, dest,
5703 _pixman_implementation_composite (imp->delegate, op,
5711 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5712 __attribute__((__force_align_arg_pointer__))
5714 static pixman_bool_t
5715 sse2_blt (pixman_implementation_t *imp,
5716 uint32_t * src_bits,
5717 uint32_t * dst_bits,
5729 if (!pixman_blt_sse2 (
5730 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5731 src_x, src_y, dst_x, dst_y, width, height))
5734 return _pixman_implementation_blt (
5736 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5737 src_x, src_y, dst_x, dst_y, width, height);
5743 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5744 __attribute__((__force_align_arg_pointer__))
5746 static pixman_bool_t
5747 sse2_fill (pixman_implementation_t *imp,
5757 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5759 return _pixman_implementation_fill (
5760 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5766 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5767 __attribute__((__force_align_arg_pointer__))
5769 pixman_implementation_t *
5770 _pixman_implementation_create_sse2 (void)
5772 pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5773 pixman_implementation_t *imp = _pixman_implementation_create (mmx);
5775 /* SSE2 constants */
5776 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5777 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5778 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5779 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5780 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5781 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5782 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5783 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5784 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
5785 mask_0080 = create_mask_16_128 (0x0080);
5786 mask_00ff = create_mask_16_128 (0x00ff);
5787 mask_0101 = create_mask_16_128 (0x0101);
5788 mask_ffff = create_mask_16_128 (0xffff);
5789 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5790 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
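/* 64-bit (__m64) counterparts of the masks above, used by the
 * MMX-register head/tail code paths. */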
5793 mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5794 mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5796 mask_x0080 = create_mask_16_64 (0x0080);
5797 mask_x00ff = create_mask_16_64 (0x00ff);
5798 mask_x0101 = create_mask_16_64 (0x0101);
5799 mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5803 /* Set up function pointers */
/* SSE2 code paths for fbcompose.c */
5806 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5807 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5808 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5809 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5810 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5811 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5812 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5813 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5814 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5815 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5817 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5819 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5820 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5821 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5822 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5823 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5824 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5825 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5826 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5827 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5828 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5829 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5831 imp->composite = sse2_composite;
5832 imp->blt = sse2_blt;
5833 imp->fill = sse2_fill;
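/* Illustrative sketch (assumed caller, not part of this file): the
 * runtime CPU dispatch typically ends up doing something like
 *
 *     if (pixman_have_sse2 ())
 *         imp = _pixman_implementation_create_sse2 ();
 *
 * so that every operation not covered by an SSE2 fast path falls
 * through the delegate chain (SSE2 -> MMX -> general). */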
5838 #endif /* USE_SSE2 */