2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41 * the pixman-x64-mmx-emulation.h file contains
42 * implementations of those MMX intrinsics that
43 * are used in the SSE2 implementation.
45 # include "pixman-x64-mmx-emulation.h"
50 /* --------------------------------------------------------------------
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
79 /* ----------------------------------------------------------------------
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
85 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
91 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
98 __m128i r, g, b, rb, t;
100 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
104 rb = _mm_or_si128 (r, b);
105 t = _mm_and_si128 (rb, mask_565_fix_rb);
106 t = _mm_srli_epi32 (t, 5);
107 rb = _mm_or_si128 (rb, t);
109 t = _mm_and_si128 (g, mask_565_fix_g);
110 t = _mm_srli_epi32 (t, 6);
111 g = _mm_or_si128 (g, t);
113 return _mm_or_si128 (rb, g);
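/* A note on unpack_565_to_8888 above: each 32-bit lane holds one r5g6b5
 * pixel in its low 16 bits.  The shifts move each field into its 8-bit
 * slot, the channel masks (initialized elsewhere in this file; assumed to
 * be the usual 0x00f80000 / 0x0000fc00 / 0x000000f8 values) isolate it,
 * and the two fix-up steps replicate the top bits of each field into its
 * low bits.  For example 0xffff expands to 0x00ffffff rather than
 * 0x00f8fcf8, so full-intensity 565 maps to full-intensity 8888.
 */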
116 static force_inline void
117 unpack_565_128_4x128 (__m128i data,
125 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
128 lo = unpack_565_to_8888 (lo);
129 hi = unpack_565_to_8888 (hi);
131 unpack_128_2x128 (lo, data0, data1);
132 unpack_128_2x128 (hi, data2, data3);
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
138 return (uint16_t) (((pixel >> 8) & 0xf800) |
139 ((pixel >> 5) & 0x07e0) |
140 ((pixel >> 3) & 0x001f));
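/* Example: the x8r8g8b8 value 0x00ff8040 packs to 0xfc08 -- only the top
 * 5/6/5 bits of the red/green/blue channels are kept.
 */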
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
146 return _mm_packus_epi16 (lo, hi);
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
153 __m128i r, g1, g2, b;
155 data = pack_2x128_128 (lo, hi);
157 r = _mm_and_si128 (data, mask_565_r);
158 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
162 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
168 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169 pack_565_2x128_128 (*xmm2, *xmm3));
172 static force_inline int
173 is_opaque (__m128i x)
175 __m128i ffs = _mm_cmpeq_epi8 (x, x);
177 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
180 static force_inline int
183 return _mm_movemask_epi8 (
184 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
187 static force_inline int
188 is_transparent (__m128i x)
190 return (_mm_movemask_epi8 (
191 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
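/* In the three tests above, _mm_movemask_epi8 yields one bit per byte, and
 * the 0x8888 mask keeps bits 3, 7, 11 and 15 -- the alpha bytes of the four
 * a8r8g8b8 pixels in the register.  So is_opaque checks that all four
 * alphas are 0xff, is_transparent that all four alphas are 0x00, and
 * is_zero that the whole register is zero.
 */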
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
197 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
203 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204 _MM_SHUFFLE (3, 3, 3, 3)),
205 _MM_SHUFFLE (3, 3, 3, 3));
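/* For an unpacked pixel (one 8-bit channel per 16-bit lane, alpha in the
 * highest lane of each 64-bit half), the pair of shuffles above copies the
 * alpha lane into all four lanes of its pixel: 00aa 00aa 00aa 00aa.
 */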
208 static force_inline void
209 expand_alpha_2x128 (__m128i data_lo,
216 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
219 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i data_lo,
231 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
247 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249 lo = _mm_adds_epu16 (lo, mask_0080);
250 hi = _mm_adds_epu16 (hi, mask_0080);
251 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
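/* The three steps above are the usual exact divide-by-255: assuming
 * mask_0080 and mask_0101 hold 0x0080 and 0x0101 in every 16-bit lane,
 * the result is ((x * a + 0x80) * 0x0101) >> 16, which for 8-bit x and a
 * equals x * a / 255 rounded to the nearest integer.
 */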
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
258 __m128i* alpha_dst_lo,
259 __m128i* alpha_dst_hi,
262 __m128i* alpha_src_lo,
263 __m128i* alpha_src_hi,
267 __m128i t1_lo, t1_hi;
268 __m128i t2_lo, t2_hi;
270 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
273 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
277 static force_inline void
278 negate_2x128 (__m128i data_lo,
283 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
287 static force_inline void
288 invert_colors_2x128 (__m128i data_lo,
295 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
311 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
313 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
315 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i src_lo,
326 __m128i alpha_lo, alpha_hi;
328 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
330 lo = _mm_or_si128 (alpha_lo, mask_alpha);
331 hi = _mm_or_si128 (alpha_hi, mask_alpha);
333 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
335 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
337 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
353 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
356 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
359 static force_inline void
360 cache_prefetch (__m128i* addr)
362 _mm_prefetch ((void const*)addr, _MM_HINT_T0);
365 static force_inline void
366 cache_prefetch_next (__m128i* addr)
368 _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
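/* Both helpers are pure hints: _MM_HINT_T0 requests the line in every
 * cache level, and prefetching past the end of a scanline is harmless
 * because prefetch instructions never fault.
 */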
371 /* load 4 pixels from a 16-byte-aligned address */
372 static force_inline __m128i
373 load_128_aligned (__m128i* src)
375 return _mm_load_si128 (src);
378 /* load 4 pixels from an unaligned address */
379 static force_inline __m128i
380 load_128_unaligned (const __m128i* src)
382 return _mm_loadu_si128 (src);
385 /* save 4 pixels to a 16-byte-aligned address with a non-temporal
386  * (write-combining) store
388 static force_inline void
389 save_128_write_combining (__m128i* dst,
392 _mm_stream_si128 (dst, data);
395 /* save 4 pixels to a 16-byte-aligned address */
396 static force_inline void
397 save_128_aligned (__m128i* dst,
400 _mm_store_si128 (dst, data);
403 /* save 4 pixels to an unaligned address */
404 static force_inline void
405 save_128_unaligned (__m128i* dst,
408 _mm_storeu_si128 (dst, data);
411 /* ------------------------------------------------------------------
415 static force_inline __m64
416 unpack_32_1x64 (uint32_t data)
418 return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
421 static force_inline __m64
422 expand_alpha_1x64 (__m64 data)
424 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
427 static force_inline __m64
428 expand_alpha_rev_1x64 (__m64 data)
430 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
433 static force_inline __m64
434 expand_pixel_8_1x64 (uint8_t data)
436 return _mm_shuffle_pi16 (
437 unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
440 static force_inline __m64
441 pix_multiply_1x64 (__m64 data,
444 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
449 static force_inline __m64
450 pix_add_multiply_1x64 (__m64* src,
455 __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
456 __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
458 return _mm_adds_pu8 (t1, t2);
461 static force_inline __m64
462 negate_1x64 (__m64 data)
464 return _mm_xor_si64 (data, mask_x00ff);
467 static force_inline __m64
468 invert_colors_1x64 (__m64 data)
470 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
473 static force_inline __m64
474 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
476 return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
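/* over_1x64 is the unpacked form of the OVER operator:
 * result = src + dst * (255 - alpha) / 255, with a saturating add.
 */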
479 static force_inline __m64
480 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
482 return over_1x64 (pix_multiply_1x64 (*src, *mask),
483 pix_multiply_1x64 (*alpha, *mask),
487 static force_inline __m64
488 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
490 __m64 alpha = expand_alpha_1x64 (src);
492 return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
493 _mm_or_si64 (alpha, mask_x_alpha)),
498 static force_inline uint32_t
499 pack_1x64_32 (__m64 data)
501 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
504 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into
508 * --- Expanding 565 in the low word ---
510 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
511 * m = m & (01f0003f001f);
512 * m = m * (008404100840);
515 * Note the trick here - the top word is shifted by another nibble to
516 * avoid it bumping into the middle word
518 static force_inline __m64
519 expand565_16_1x64 (uint16_t pixel)
524 p = _mm_cvtsi32_si64 ((uint32_t) pixel);
526 t1 = _mm_slli_si64 (p, 36 - 11);
527 t2 = _mm_slli_si64 (p, 16 - 5);
529 p = _mm_or_si64 (t1, p);
530 p = _mm_or_si64 (t2, p);
531 p = _mm_and_si64 (p, mask_x565_rgb);
532 p = _mm_mullo_pi16 (p, mask_x565_unpack);
534 return _mm_srli_pi16 (p, 8);
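/* Worked example for expand565_16_1x64, assuming the mask values quoted in
 * the comment above: 0xffff or-ed with its copies shifted left by 11 and 25
 * and masked with 0x1f0003f001f leaves 0x01f0 / 0x003f / 0x001f in the red,
 * green and blue words; multiplying by 0x0084 / 0x0410 / 0x0840 and
 * shifting each word right by 8 gives 0x00ff in each, i.e. 0x000000ff00ff00ff.
 */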
537 /* ----------------------------------------------------------------------------
538 * Compose Core transformations
540 static force_inline uint32_t
541 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
554 ms = unpack_32_1x64 (src);
555 return pack_1x64_32 (
556 over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
562 static force_inline uint32_t
563 combine1 (const uint32_t *ps, const uint32_t *pm)
571 mm = unpack_32_1x64 (*pm);
572 mm = expand_alpha_1x64 (mm);
574 ms = unpack_32_1x64 (s);
575 ms = pix_multiply_1x64 (ms, mm);
577 s = pack_1x64_32 (ms);
583 static force_inline __m128i
584 combine4 (const __m128i *ps, const __m128i *pm)
586 __m128i xmm_src_lo, xmm_src_hi;
587 __m128i xmm_msk_lo, xmm_msk_hi;
592 xmm_msk_lo = load_128_unaligned (pm);
594 if (is_transparent (xmm_msk_lo))
595 return _mm_setzero_si128 ();
598 s = load_128_unaligned (ps);
602 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
603 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
605 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
607 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
608 &xmm_msk_lo, &xmm_msk_hi,
609 &xmm_src_lo, &xmm_src_hi);
611 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
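/* combine1 and combine4 implement the optional per-pixel mask used by the
 * unified ("_u") combiners: when pm is non-NULL, the source is multiplied
 * by the mask's alpha channel before being combined.  Component-alpha
 * masks are handled separately by the *_ca variants further down.
 */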
617 static force_inline void
618 core_combine_over_u_sse2 (uint32_t* pd,
625 __m128i xmm_dst_lo, xmm_dst_hi;
626 __m128i xmm_src_lo, xmm_src_hi;
627 __m128i xmm_alpha_lo, xmm_alpha_hi;
629 /* call prefetch hint to optimize cache load*/
630 cache_prefetch ((__m128i*)ps);
631 cache_prefetch ((__m128i*)pd);
632 cache_prefetch ((__m128i*)pm);
634 /* Align dst on a 16-byte boundary */
635 while (w && ((unsigned long)pd & 15))
638 s = combine1 (ps, pm);
640 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
647 /* call prefetch hint to optimize cache load*/
648 cache_prefetch ((__m128i*)ps);
649 cache_prefetch ((__m128i*)pd);
650 cache_prefetch ((__m128i*)pm);
654 /* fill cache line with next memory */
655 cache_prefetch_next ((__m128i*)ps);
656 cache_prefetch_next ((__m128i*)pd);
657 cache_prefetch_next ((__m128i*)pm);
659 /* Load the source and mask unaligned: only the destination pointer
660  * was aligned above, so their alignment is not guaranteed.
662 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
664 if (is_opaque (xmm_src_hi))
666 save_128_aligned ((__m128i*)pd, xmm_src_hi);
668 else if (!is_zero (xmm_src_hi))
670 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
672 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
673 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
676 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
678 over_2x128 (&xmm_src_lo, &xmm_src_hi,
679 &xmm_alpha_lo, &xmm_alpha_hi,
680 &xmm_dst_lo, &xmm_dst_hi);
682 /* rebuild the 4 pixels and save */
683 save_128_aligned ((__m128i*)pd,
684 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
697 s = combine1 (ps, pm);
699 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
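/* The structure above is shared by all the unified combiners in this file:
 * combine single pixels until pd is 16-byte aligned, process four pixels
 * per iteration with the SSE2 path (the OVER loop additionally stores the
 * source directly when it is fully opaque and skips the store when it is
 * entirely zero), then finish the remaining 1-3 pixels one at a time.
 */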
708 static force_inline void
709 core_combine_over_reverse_u_sse2 (uint32_t* pd,
716 __m128i xmm_dst_lo, xmm_dst_hi;
717 __m128i xmm_src_lo, xmm_src_hi;
718 __m128i xmm_alpha_lo, xmm_alpha_hi;
720 /* call prefetch hint to optimize cache load*/
721 cache_prefetch ((__m128i*)ps);
722 cache_prefetch ((__m128i*)pd);
723 cache_prefetch ((__m128i*)pm);
725 /* Align dst on a 16-byte boundary */
727 ((unsigned long)pd & 15))
730 s = combine1 (ps, pm);
732 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
739 /* call prefetch hint to optimize cache load*/
740 cache_prefetch ((__m128i*)ps);
741 cache_prefetch ((__m128i*)pd);
742 cache_prefetch ((__m128i*)pm);
746 /* fill cache line with next memory */
747 cache_prefetch_next ((__m128i*)ps);
748 cache_prefetch_next ((__m128i*)pd);
749 cache_prefetch_next ((__m128i*)pm);
751 /* Load the source and mask unaligned: only the destination pointer
752  * was aligned above, so their alignment is not guaranteed.
754 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
755 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
757 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
758 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
760 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
761 &xmm_alpha_lo, &xmm_alpha_hi);
763 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
764 &xmm_alpha_lo, &xmm_alpha_hi,
765 &xmm_src_lo, &xmm_src_hi);
767 /* rebuild the 4 pixels and save */
768 save_128_aligned ((__m128i*)pd,
769 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
782 s = combine1 (ps, pm);
784 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
792 static force_inline uint32_t
793 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
795 uint32_t maska = src >> 24;
801 else if (maska != 0xff)
803 return pack_1x64_32 (
804 pix_multiply_1x64 (unpack_32_1x64 (dst),
805 expand_alpha_1x64 (unpack_32_1x64 (src))));
811 static force_inline void
812 core_combine_in_u_sse2 (uint32_t* pd,
819 __m128i xmm_src_lo, xmm_src_hi;
820 __m128i xmm_dst_lo, xmm_dst_hi;
822 /* call prefetch hint to optimize cache load*/
823 cache_prefetch ((__m128i*)ps);
824 cache_prefetch ((__m128i*)pd);
825 cache_prefetch ((__m128i*)pm);
827 while (w && ((unsigned long) pd & 15))
829 s = combine1 (ps, pm);
832 *pd++ = core_combine_in_u_pixelsse2 (d, s);
839 /* call prefetch hint to optimize cache load*/
840 cache_prefetch ((__m128i*)ps);
841 cache_prefetch ((__m128i*)pd);
842 cache_prefetch ((__m128i*)pm);
846 /* fill cache line with next memory */
847 cache_prefetch_next ((__m128i*)ps);
848 cache_prefetch_next ((__m128i*)pd);
849 cache_prefetch_next ((__m128i*)pm);
851 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
852 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
854 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
855 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
857 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
858 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
859 &xmm_dst_lo, &xmm_dst_hi,
860 &xmm_dst_lo, &xmm_dst_hi);
862 save_128_aligned ((__m128i*)pd,
863 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
874 s = combine1 (ps, pm);
877 *pd++ = core_combine_in_u_pixelsse2 (d, s);
885 static force_inline void
886 core_combine_reverse_in_u_sse2 (uint32_t* pd,
893 __m128i xmm_src_lo, xmm_src_hi;
894 __m128i xmm_dst_lo, xmm_dst_hi;
896 /* call prefetch hint to optimize cache load*/
897 cache_prefetch ((__m128i*)ps);
898 cache_prefetch ((__m128i*)pd);
899 cache_prefetch ((__m128i*)pm);
901 while (w && ((unsigned long) pd & 15))
903 s = combine1 (ps, pm);
906 *pd++ = core_combine_in_u_pixelsse2 (s, d);
913 /* call prefetch hint to optimize cache load*/
914 cache_prefetch ((__m128i*)ps);
915 cache_prefetch ((__m128i*)pd);
916 cache_prefetch ((__m128i*)pm);
920 /* fill cache line with next memory */
921 cache_prefetch_next ((__m128i*)ps);
922 cache_prefetch_next ((__m128i*)pd);
923 cache_prefetch_next ((__m128i*)pm);
925 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
926 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
928 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
929 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
931 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
932 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
933 &xmm_src_lo, &xmm_src_hi,
934 &xmm_dst_lo, &xmm_dst_hi);
937 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
948 s = combine1 (ps, pm);
951 *pd++ = core_combine_in_u_pixelsse2 (s, d);
959 static force_inline void
960 core_combine_reverse_out_u_sse2 (uint32_t* pd,
965 /* call prefetch hint to optimize cache load*/
966 cache_prefetch ((__m128i*)ps);
967 cache_prefetch ((__m128i*)pd);
968 cache_prefetch ((__m128i*)pm);
970 while (w && ((unsigned long) pd & 15))
972 uint32_t s = combine1 (ps, pm);
975 *pd++ = pack_1x64_32 (
977 unpack_32_1x64 (d), negate_1x64 (
978 expand_alpha_1x64 (unpack_32_1x64 (s)))));
986 /* call prefetch hint to optimize cache load*/
987 cache_prefetch ((__m128i*)ps);
988 cache_prefetch ((__m128i*)pd);
989 cache_prefetch ((__m128i*)pm);
993 __m128i xmm_src_lo, xmm_src_hi;
994 __m128i xmm_dst_lo, xmm_dst_hi;
996 /* fill cache line with next memory */
997 cache_prefetch_next ((__m128i*)ps);
998 cache_prefetch_next ((__m128i*)pd);
999 cache_prefetch_next ((__m128i*)pm);
1001 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1002 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1004 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1005 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1007 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1008 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1010 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1011 &xmm_src_lo, &xmm_src_hi,
1012 &xmm_dst_lo, &xmm_dst_hi);
1015 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1027 uint32_t s = combine1 (ps, pm);
1030 *pd++ = pack_1x64_32 (
1032 unpack_32_1x64 (d), negate_1x64 (
1033 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1041 static force_inline void
1042 core_combine_out_u_sse2 (uint32_t* pd,
1047 /* call prefetch hint to optimize cache load*/
1048 cache_prefetch ((__m128i*)ps);
1049 cache_prefetch ((__m128i*)pd);
1050 cache_prefetch ((__m128i*)pm);
1052 while (w && ((unsigned long) pd & 15))
1054 uint32_t s = combine1 (ps, pm);
1057 *pd++ = pack_1x64_32 (
1059 unpack_32_1x64 (s), negate_1x64 (
1060 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1067 /* call prefetch hint to optimize cache load*/
1068 cache_prefetch ((__m128i*)ps);
1069 cache_prefetch ((__m128i*)pd);
1070 cache_prefetch ((__m128i*)pm);
1074 __m128i xmm_src_lo, xmm_src_hi;
1075 __m128i xmm_dst_lo, xmm_dst_hi;
1077 /* fill cache line with next memory */
1078 cache_prefetch_next ((__m128i*)ps);
1079 cache_prefetch_next ((__m128i*)pd);
1080 cache_prefetch_next ((__m128i*)pm);
1082 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1083 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1085 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1086 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1088 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1089 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1091 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1092 &xmm_dst_lo, &xmm_dst_hi,
1093 &xmm_dst_lo, &xmm_dst_hi);
1096 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1107 uint32_t s = combine1 (ps, pm);
1110 *pd++ = pack_1x64_32 (
1112 unpack_32_1x64 (s), negate_1x64 (
1113 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1121 static force_inline uint32_t
1122 core_combine_atop_u_pixel_sse2 (uint32_t src,
1125 __m64 s = unpack_32_1x64 (src);
1126 __m64 d = unpack_32_1x64 (dst);
1128 __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1129 __m64 da = expand_alpha_1x64 (d);
1131 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1134 static force_inline void
1135 core_combine_atop_u_sse2 (uint32_t* pd,
1142 __m128i xmm_src_lo, xmm_src_hi;
1143 __m128i xmm_dst_lo, xmm_dst_hi;
1144 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1145 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1147 /* call prefetch hint to optimize cache load*/
1148 cache_prefetch ((__m128i*)ps);
1149 cache_prefetch ((__m128i*)pd);
1150 cache_prefetch ((__m128i*)pm);
1152 while (w && ((unsigned long) pd & 15))
1154 s = combine1 (ps, pm);
1157 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1164 /* call prefetch hint to optimize cache load*/
1165 cache_prefetch ((__m128i*)ps);
1166 cache_prefetch ((__m128i*)pd);
1167 cache_prefetch ((__m128i*)pm);
1171 /* fill cache line with next memory */
1172 cache_prefetch_next ((__m128i*)ps);
1173 cache_prefetch_next ((__m128i*)pd);
1174 cache_prefetch_next ((__m128i*)pm);
1176 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1177 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1179 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1180 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1182 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1183 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1184 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1185 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1187 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1188 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1190 pix_add_multiply_2x128 (
1191 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1192 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1193 &xmm_dst_lo, &xmm_dst_hi);
1196 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1207 s = combine1 (ps, pm);
1210 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1218 static force_inline uint32_t
1219 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1222 __m64 s = unpack_32_1x64 (src);
1223 __m64 d = unpack_32_1x64 (dst);
1225 __m64 sa = expand_alpha_1x64 (s);
1226 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1228 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1231 static force_inline void
1232 core_combine_reverse_atop_u_sse2 (uint32_t* pd,
1239 __m128i xmm_src_lo, xmm_src_hi;
1240 __m128i xmm_dst_lo, xmm_dst_hi;
1241 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1242 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1244 /* call prefetch hint to optimize cache load*/
1245 cache_prefetch ((__m128i*)ps);
1246 cache_prefetch ((__m128i*)pd);
1247 cache_prefetch ((__m128i*)pm);
1249 while (w && ((unsigned long) pd & 15))
1251 s = combine1 (ps, pm);
1254 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1261 /* call prefetch hint to optimize cache load*/
1262 cache_prefetch ((__m128i*)ps);
1263 cache_prefetch ((__m128i*)pd);
1264 cache_prefetch ((__m128i*)pm);
1268 /* fill cache line with next memory */
1269 cache_prefetch_next ((__m128i*)ps);
1270 cache_prefetch_next ((__m128i*)pd);
1271 cache_prefetch_next ((__m128i*)pm);
1273 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1274 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1276 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1277 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1279 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1280 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1281 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1282 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1284 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1285 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1287 pix_add_multiply_2x128 (
1288 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1289 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1290 &xmm_dst_lo, &xmm_dst_hi);
1293 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1304 s = combine1 (ps, pm);
1307 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1315 static force_inline uint32_t
1316 core_combine_xor_u_pixel_sse2 (uint32_t src,
1319 __m64 s = unpack_32_1x64 (src);
1320 __m64 d = unpack_32_1x64 (dst);
1322 __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1323 __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1325 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1328 static force_inline void
1329 core_combine_xor_u_sse2 (uint32_t* dst,
1330 const uint32_t* src,
1331 const uint32_t *mask,
1337 const uint32_t* ps = src;
1338 const uint32_t* pm = mask;
1340 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1341 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1342 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1343 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1345 /* call prefetch hint to optimize cache load*/
1346 cache_prefetch ((__m128i*)ps);
1347 cache_prefetch ((__m128i*)pd);
1348 cache_prefetch ((__m128i*)pm);
1350 while (w && ((unsigned long) pd & 15))
1352 s = combine1 (ps, pm);
1355 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1362 /* call prefetch hint to optimize cache load*/
1363 cache_prefetch ((__m128i*)ps);
1364 cache_prefetch ((__m128i*)pd);
1365 cache_prefetch ((__m128i*)pm);
1369 /* fill cache line with next memory */
1370 cache_prefetch_next ((__m128i*)ps);
1371 cache_prefetch_next ((__m128i*)pd);
1372 cache_prefetch_next ((__m128i*)pm);
1374 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1375 xmm_dst = load_128_aligned ((__m128i*) pd);
1377 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1378 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1380 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1381 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1382 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1383 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1385 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1386 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1387 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1388 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1390 pix_add_multiply_2x128 (
1391 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1392 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1393 &xmm_dst_lo, &xmm_dst_hi);
1396 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1407 s = combine1 (ps, pm);
1410 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1418 static force_inline void
1419 core_combine_add_u_sse2 (uint32_t* dst,
1420 const uint32_t* src,
1421 const uint32_t* mask,
1427 const uint32_t* ps = src;
1428 const uint32_t* pm = mask;
1430 /* call prefetch hint to optimize cache load*/
1431 cache_prefetch ((__m128i*)ps);
1432 cache_prefetch ((__m128i*)pd);
1433 cache_prefetch ((__m128i*)pm);
1435 while (w && (unsigned long)pd & 15)
1437 s = combine1 (ps, pm);
1443 *pd++ = _mm_cvtsi64_si32 (
1444 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1448 /* call prefetch hint to optimize cache load*/
1449 cache_prefetch ((__m128i*)ps);
1450 cache_prefetch ((__m128i*)pd);
1451 cache_prefetch ((__m128i*)pm);
1457 /* fill cache line with next memory */
1458 cache_prefetch_next ((__m128i*)ps);
1459 cache_prefetch_next ((__m128i*)pd);
1460 cache_prefetch_next ((__m128i*)pm);
1462 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1465 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1476 s = combine1 (ps, pm);
1480 *pd++ = _mm_cvtsi64_si32 (
1481 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1487 static force_inline uint32_t
1488 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1491 __m64 ms = unpack_32_1x64 (src);
1492 __m64 md = unpack_32_1x64 (dst);
1493 uint32_t sa = src >> 24;
1494 uint32_t da = ~dst >> 24;
1498 ms = pix_multiply_1x64 (
1499 ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1502 return pack_1x64_32 (_mm_adds_pu16 (md, ms));
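/* In the helper above, da is the headroom left in the destination
 * (255 - destination alpha).  When the source alpha exceeds it, the source
 * is scaled by DIV_UN8 (da, sa) before the saturating add, so the result
 * alpha never exceeds 0xff, as the SATURATE operator requires.
 */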
1505 static force_inline void
1506 core_combine_saturate_u_sse2 (uint32_t * pd,
1514 __m128i xmm_src, xmm_dst;
1516 /* call prefetch hint to optimize cache load*/
1517 cache_prefetch ((__m128i*)ps);
1518 cache_prefetch ((__m128i*)pd);
1519 cache_prefetch ((__m128i*)pm);
1521 while (w && (unsigned long)pd & 15)
1523 s = combine1 (ps, pm);
1526 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1533 /* call prefetch hint to optimize cache load*/
1534 cache_prefetch ((__m128i*)ps);
1535 cache_prefetch ((__m128i*)pd);
1536 cache_prefetch ((__m128i*)pm);
1540 /* fill cache line with next memory */
1541 cache_prefetch_next ((__m128i*)ps);
1542 cache_prefetch_next ((__m128i*)pd);
1543 cache_prefetch_next ((__m128i*)pm);
1545 xmm_dst = load_128_aligned ((__m128i*)pd);
1546 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1548 pack_cmp = _mm_movemask_epi8 (
1550 _mm_srli_epi32 (xmm_src, 24),
1551 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1553 /* if any source alpha is greater than the respective ~dst alpha */
1556 s = combine1 (ps++, pm);
1558 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1562 s = combine1 (ps++, pm);
1564 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1568 s = combine1 (ps++, pm);
1570 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1574 s = combine1 (ps++, pm);
1576 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1582 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1595 s = combine1 (ps, pm);
1598 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1605 static force_inline void
1606 core_combine_src_ca_sse2 (uint32_t* pd,
1613 __m128i xmm_src_lo, xmm_src_hi;
1614 __m128i xmm_mask_lo, xmm_mask_hi;
1615 __m128i xmm_dst_lo, xmm_dst_hi;
1617 /* call prefetch hint to optimize cache load*/
1618 cache_prefetch ((__m128i*)ps);
1619 cache_prefetch ((__m128i*)pd);
1620 cache_prefetch ((__m128i*)pm);
1622 while (w && (unsigned long)pd & 15)
1626 *pd++ = pack_1x64_32 (
1627 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1631 /* call prefetch hint to optimize cache load*/
1632 cache_prefetch ((__m128i*)ps);
1633 cache_prefetch ((__m128i*)pd);
1634 cache_prefetch ((__m128i*)pm);
1638 /* fill cache line with next memory */
1639 cache_prefetch_next ((__m128i*)ps);
1640 cache_prefetch_next ((__m128i*)pd);
1641 cache_prefetch_next ((__m128i*)pm);
1643 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1644 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1646 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1647 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1649 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1650 &xmm_mask_lo, &xmm_mask_hi,
1651 &xmm_dst_lo, &xmm_dst_hi);
1654 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1666 *pd++ = pack_1x64_32 (
1667 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1672 static force_inline uint32_t
1673 core_combine_over_ca_pixel_sse2 (uint32_t src,
1677 __m64 s = unpack_32_1x64 (src);
1678 __m64 expAlpha = expand_alpha_1x64 (s);
1679 __m64 unpk_mask = unpack_32_1x64 (mask);
1680 __m64 unpk_dst = unpack_32_1x64 (dst);
1682 return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1685 static force_inline void
1686 core_combine_over_ca_sse2 (uint32_t* pd,
1693 __m128i xmm_alpha_lo, xmm_alpha_hi;
1694 __m128i xmm_src_lo, xmm_src_hi;
1695 __m128i xmm_dst_lo, xmm_dst_hi;
1696 __m128i xmm_mask_lo, xmm_mask_hi;
1698 /* call prefetch hint to optimize cache load*/
1699 cache_prefetch ((__m128i*)ps);
1700 cache_prefetch ((__m128i*)pd);
1701 cache_prefetch ((__m128i*)pm);
1703 while (w && (unsigned long)pd & 15)
1709 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1713 /* call prefetch hint to optimize cache load*/
1714 cache_prefetch ((__m128i*)ps);
1715 cache_prefetch ((__m128i*)pd);
1716 cache_prefetch ((__m128i*)pm);
1720 /* fill cache line with next memory */
1721 cache_prefetch_next ((__m128i*)ps);
1722 cache_prefetch_next ((__m128i*)pd);
1723 cache_prefetch_next ((__m128i*)pm);
1725 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1726 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1727 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1729 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1730 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1731 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1733 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1734 &xmm_alpha_lo, &xmm_alpha_hi);
1736 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1737 &xmm_alpha_lo, &xmm_alpha_hi,
1738 &xmm_mask_lo, &xmm_mask_hi,
1739 &xmm_dst_lo, &xmm_dst_hi);
1742 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1756 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1761 static force_inline uint32_t
1762 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1766 __m64 d = unpack_32_1x64 (dst);
1768 return pack_1x64_32 (
1769 over_1x64 (d, expand_alpha_1x64 (d),
1770 pix_multiply_1x64 (unpack_32_1x64 (src),
1771 unpack_32_1x64 (mask))));
1774 static force_inline void
1775 core_combine_over_reverse_ca_sse2 (uint32_t* pd,
1782 __m128i xmm_alpha_lo, xmm_alpha_hi;
1783 __m128i xmm_src_lo, xmm_src_hi;
1784 __m128i xmm_dst_lo, xmm_dst_hi;
1785 __m128i xmm_mask_lo, xmm_mask_hi;
1787 /* call prefetch hint to optimize cache load*/
1788 cache_prefetch ((__m128i*)ps);
1789 cache_prefetch ((__m128i*)pd);
1790 cache_prefetch ((__m128i*)pm);
1792 while (w && (unsigned long)pd & 15)
1798 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1802 /* call prefetch hint to optimize cache load*/
1803 cache_prefetch ((__m128i*)ps);
1804 cache_prefetch ((__m128i*)pd);
1805 cache_prefetch ((__m128i*)pm);
1809 /* fill cache line with next memory */
1810 cache_prefetch_next ((__m128i*)ps);
1811 cache_prefetch_next ((__m128i*)pd);
1812 cache_prefetch_next ((__m128i*)pm);
1814 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1815 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1816 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1818 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1819 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1820 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1822 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1823 &xmm_alpha_lo, &xmm_alpha_hi);
1824 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1825 &xmm_mask_lo, &xmm_mask_hi,
1826 &xmm_mask_lo, &xmm_mask_hi);
1828 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1829 &xmm_alpha_lo, &xmm_alpha_hi,
1830 &xmm_mask_lo, &xmm_mask_hi);
1833 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1847 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1852 static force_inline void
1853 core_combine_in_ca_sse2 (uint32_t * pd,
1860 __m128i xmm_alpha_lo, xmm_alpha_hi;
1861 __m128i xmm_src_lo, xmm_src_hi;
1862 __m128i xmm_dst_lo, xmm_dst_hi;
1863 __m128i xmm_mask_lo, xmm_mask_hi;
1865 /* call prefetch hint to optimize cache load*/
1866 cache_prefetch ((__m128i*)ps);
1867 cache_prefetch ((__m128i*)pd);
1868 cache_prefetch ((__m128i*)pm);
1870 while (w && (unsigned long)pd & 15)
1876 *pd++ = pack_1x64_32 (
1878 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1879 expand_alpha_1x64 (unpack_32_1x64 (d))));
1884 /* call prefetch hint to optimize cache load*/
1885 cache_prefetch ((__m128i*)ps);
1886 cache_prefetch ((__m128i*)pd);
1887 cache_prefetch ((__m128i*)pm);
1891 /* fill cache line with next memory */
1892 cache_prefetch_next ((__m128i*)ps);
1893 cache_prefetch_next ((__m128i*)pd);
1894 cache_prefetch_next ((__m128i*)pm);
1896 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1897 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1898 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1900 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1901 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1902 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1904 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1905 &xmm_alpha_lo, &xmm_alpha_hi);
1907 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1908 &xmm_mask_lo, &xmm_mask_hi,
1909 &xmm_dst_lo, &xmm_dst_hi);
1911 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1912 &xmm_alpha_lo, &xmm_alpha_hi,
1913 &xmm_dst_lo, &xmm_dst_hi);
1916 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1930 *pd++ = pack_1x64_32 (
1933 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1934 expand_alpha_1x64 (unpack_32_1x64 (d))));
1940 static force_inline void
1941 core_combine_in_reverse_ca_sse2 (uint32_t * pd,
1948 __m128i xmm_alpha_lo, xmm_alpha_hi;
1949 __m128i xmm_src_lo, xmm_src_hi;
1950 __m128i xmm_dst_lo, xmm_dst_hi;
1951 __m128i xmm_mask_lo, xmm_mask_hi;
1953 /* call prefetch hint to optimize cache load*/
1954 cache_prefetch ((__m128i*)ps);
1955 cache_prefetch ((__m128i*)pd);
1956 cache_prefetch ((__m128i*)pm);
1958 while (w && (unsigned long)pd & 15)
1964 *pd++ = pack_1x64_32 (
1967 pix_multiply_1x64 (unpack_32_1x64 (m),
1968 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1972 /* call prefetch hint to optimize cache load*/
1973 cache_prefetch ((__m128i*)ps);
1974 cache_prefetch ((__m128i*)pd);
1975 cache_prefetch ((__m128i*)pm);
1979 /* fill cache line with next memory */
1980 cache_prefetch_next ((__m128i*)ps);
1981 cache_prefetch_next ((__m128i*)pd);
1982 cache_prefetch_next ((__m128i*)pm);
1984 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1985 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1986 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1988 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1989 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1990 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1992 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1993 &xmm_alpha_lo, &xmm_alpha_hi);
1994 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1995 &xmm_alpha_lo, &xmm_alpha_hi,
1996 &xmm_alpha_lo, &xmm_alpha_hi);
1998 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1999 &xmm_alpha_lo, &xmm_alpha_hi,
2000 &xmm_dst_lo, &xmm_dst_hi);
2003 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2017 *pd++ = pack_1x64_32 (
2020 pix_multiply_1x64 (unpack_32_1x64 (m),
2021 expand_alpha_1x64 (unpack_32_1x64 (s)))));
2026 static force_inline void
2027 core_combine_out_ca_sse2 (uint32_t * pd,
2034 __m128i xmm_alpha_lo, xmm_alpha_hi;
2035 __m128i xmm_src_lo, xmm_src_hi;
2036 __m128i xmm_dst_lo, xmm_dst_hi;
2037 __m128i xmm_mask_lo, xmm_mask_hi;
2039 /* call prefetch hint to optimize cache load*/
2040 cache_prefetch ((__m128i*)ps);
2041 cache_prefetch ((__m128i*)pd);
2042 cache_prefetch ((__m128i*)pm);
2044 while (w && (unsigned long)pd & 15)
2050 *pd++ = pack_1x64_32 (
2053 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2054 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2058 /* call prefetch hint to optimize cache load*/
2059 cache_prefetch ((__m128i*)ps);
2060 cache_prefetch ((__m128i*)pd);
2061 cache_prefetch ((__m128i*)pm);
2065 /* fill cache line with next memory */
2066 cache_prefetch_next ((__m128i*)ps);
2067 cache_prefetch_next ((__m128i*)pd);
2068 cache_prefetch_next ((__m128i*)pm);
2070 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2071 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2072 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2074 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2075 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2076 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2078 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2079 &xmm_alpha_lo, &xmm_alpha_hi);
2080 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2081 &xmm_alpha_lo, &xmm_alpha_hi);
2083 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2084 &xmm_mask_lo, &xmm_mask_hi,
2085 &xmm_dst_lo, &xmm_dst_hi);
2086 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2087 &xmm_alpha_lo, &xmm_alpha_hi,
2088 &xmm_dst_lo, &xmm_dst_hi);
2091 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2105 *pd++ = pack_1x64_32 (
2108 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2109 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2115 static force_inline void
2116 core_combine_out_reverse_ca_sse2 (uint32_t * pd,
2123 __m128i xmm_alpha_lo, xmm_alpha_hi;
2124 __m128i xmm_src_lo, xmm_src_hi;
2125 __m128i xmm_dst_lo, xmm_dst_hi;
2126 __m128i xmm_mask_lo, xmm_mask_hi;
2128 /* call prefetch hint to optimize cache load*/
2129 cache_prefetch ((__m128i*)ps);
2130 cache_prefetch ((__m128i*)pd);
2131 cache_prefetch ((__m128i*)pm);
2133 while (w && (unsigned long)pd & 15)
2139 *pd++ = pack_1x64_32 (
2142 negate_1x64 (pix_multiply_1x64 (
2144 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2148 /* call prefetch hint to optimize cache load*/
2149 cache_prefetch ((__m128i*)ps);
2150 cache_prefetch ((__m128i*)pd);
2151 cache_prefetch ((__m128i*)pm);
2155 /* fill cache line with next memory */
2156 cache_prefetch_next ((__m128i*)ps);
2157 cache_prefetch_next ((__m128i*)pd);
2158 cache_prefetch_next ((__m128i*)pm);
2160 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2164 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2168 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169 &xmm_alpha_lo, &xmm_alpha_hi);
2171 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2172 &xmm_alpha_lo, &xmm_alpha_hi,
2173 &xmm_mask_lo, &xmm_mask_hi);
2175 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2176 &xmm_mask_lo, &xmm_mask_hi);
2178 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2179 &xmm_mask_lo, &xmm_mask_hi,
2180 &xmm_dst_lo, &xmm_dst_hi);
2183 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2197 *pd++ = pack_1x64_32 (
2200 negate_1x64 (pix_multiply_1x64 (
2202 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2207 static force_inline uint32_t
2208 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2212 __m64 m = unpack_32_1x64 (mask);
2213 __m64 s = unpack_32_1x64 (src);
2214 __m64 d = unpack_32_1x64 (dst);
2215 __m64 sa = expand_alpha_1x64 (s);
2216 __m64 da = expand_alpha_1x64 (d);
2218 s = pix_multiply_1x64 (s, m);
2219 m = negate_1x64 (pix_multiply_1x64 (m, sa));
2221 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2224 static force_inline void
2225 core_combine_atop_ca_sse2 (uint32_t * pd,
2232 __m128i xmm_src_lo, xmm_src_hi;
2233 __m128i xmm_dst_lo, xmm_dst_hi;
2234 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2235 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2236 __m128i xmm_mask_lo, xmm_mask_hi;
2238 /* call prefetch hint to optimize cache load*/
2239 cache_prefetch ((__m128i*)ps);
2240 cache_prefetch ((__m128i*)pd);
2241 cache_prefetch ((__m128i*)pm);
2243 while (w && (unsigned long)pd & 15)
2249 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2253 /* call prefetch hint to optimize cache load*/
2254 cache_prefetch ((__m128i*)ps);
2255 cache_prefetch ((__m128i*)pd);
2256 cache_prefetch ((__m128i*)pm);
2260 /* fill cache line with next memory */
2261 cache_prefetch_next ((__m128i*)ps);
2262 cache_prefetch_next ((__m128i*)pd);
2263 cache_prefetch_next ((__m128i*)pm);
2265 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2266 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2267 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2269 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2270 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2271 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2273 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2274 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2275 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2276 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2278 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2279 &xmm_mask_lo, &xmm_mask_hi,
2280 &xmm_src_lo, &xmm_src_hi);
2281 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2282 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2283 &xmm_mask_lo, &xmm_mask_hi);
2285 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2287 pix_add_multiply_2x128 (
2288 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2289 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2290 &xmm_dst_lo, &xmm_dst_hi);
2293 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2307 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2312 static force_inline uint32_t
2313 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2317 __m64 m = unpack_32_1x64 (mask);
2318 __m64 s = unpack_32_1x64 (src);
2319 __m64 d = unpack_32_1x64 (dst);
2321 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2322 __m64 sa = expand_alpha_1x64 (s);
2324 s = pix_multiply_1x64 (s, m);
2325 m = pix_multiply_1x64 (m, sa);
2327 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2330 static force_inline void
2331 core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
2338 __m128i xmm_src_lo, xmm_src_hi;
2339 __m128i xmm_dst_lo, xmm_dst_hi;
2340 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2341 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2342 __m128i xmm_mask_lo, xmm_mask_hi;
2344 /* call prefetch hint to optimize cache load*/
2345 cache_prefetch ((__m128i*)ps);
2346 cache_prefetch ((__m128i*)pd);
2347 cache_prefetch ((__m128i*)pm);
2349 while (w && (unsigned long)pd & 15)
2355 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2359 /* call prefetch hint to optimize cache load*/
2360 cache_prefetch ((__m128i*)ps);
2361 cache_prefetch ((__m128i*)pd);
2362 cache_prefetch ((__m128i*)pm);
2366 /* fill cache line with next memory */
2367 cache_prefetch_next ((__m128i*)ps);
2368 cache_prefetch_next ((__m128i*)pd);
2369 cache_prefetch_next ((__m128i*)pm);
2371 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2372 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2373 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2375 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2376 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2377 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2379 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2380 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2381 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2382 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2384 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2385 &xmm_mask_lo, &xmm_mask_hi,
2386 &xmm_src_lo, &xmm_src_hi);
2387 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2388 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2389 &xmm_mask_lo, &xmm_mask_hi);
2391 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2392 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2394 pix_add_multiply_2x128 (
2395 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2396 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2397 &xmm_dst_lo, &xmm_dst_hi);
2400 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2414 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2419 static force_inline uint32_t
2420 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2424 __m64 a = unpack_32_1x64 (mask);
2425 __m64 s = unpack_32_1x64 (src);
2426 __m64 d = unpack_32_1x64 (dst);
2428 __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2429 a, expand_alpha_1x64 (s)));
2430 __m64 dest = pix_multiply_1x64 (s, a);
2431 __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2433 return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2439 static force_inline void
2440 core_combine_xor_ca_sse2 (uint32_t * pd,
2447 __m128i xmm_src_lo, xmm_src_hi;
2448 __m128i xmm_dst_lo, xmm_dst_hi;
2449 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2450 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2451 __m128i xmm_mask_lo, xmm_mask_hi;
2453 /* call prefetch hint to optimize cache load*/
2454 cache_prefetch ((__m128i*)ps);
2455 cache_prefetch ((__m128i*)pd);
2456 cache_prefetch ((__m128i*)pm);
2458 while (w && (unsigned long)pd & 15)
2464 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2468 /* call prefetch hint to optimize cache load*/
2469 cache_prefetch ((__m128i*)ps);
2470 cache_prefetch ((__m128i*)pd);
2471 cache_prefetch ((__m128i*)pm);
2475 /* fill cache line with next memory */
2476 cache_prefetch_next ((__m128i*)ps);
2477 cache_prefetch_next ((__m128i*)pd);
2478 cache_prefetch_next ((__m128i*)pm);
2480 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2481 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2482 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2484 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2485 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2486 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2488 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2489 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2490 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2491 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2493 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2494 &xmm_mask_lo, &xmm_mask_hi,
2495 &xmm_src_lo, &xmm_src_hi);
2496 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2497 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2498 &xmm_mask_lo, &xmm_mask_hi);
2500 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2501 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2502 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2503 &xmm_mask_lo, &xmm_mask_hi);
2505 pix_add_multiply_2x128 (
2506 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2507 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2508 &xmm_dst_lo, &xmm_dst_hi);
2511 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2525 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2530 static force_inline void
2531 core_combine_add_ca_sse2 (uint32_t * pd,
2538 __m128i xmm_src_lo, xmm_src_hi;
2539 __m128i xmm_dst_lo, xmm_dst_hi;
2540 __m128i xmm_mask_lo, xmm_mask_hi;
2542 /* call prefetch hint to optimize cache load*/
2543 cache_prefetch ((__m128i*)ps);
2544 cache_prefetch ((__m128i*)pd);
2545 cache_prefetch ((__m128i*)pm);
2547 while (w && (unsigned long)pd & 15)
2553 *pd++ = pack_1x64_32 (
2554 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2555 unpack_32_1x64 (m)),
2556 unpack_32_1x64 (d)));
2560 /* call prefetch hint to optimize cache load*/
2561 cache_prefetch ((__m128i*)ps);
2562 cache_prefetch ((__m128i*)pd);
2563 cache_prefetch ((__m128i*)pm);
2567 /* fill cache line with next memory */
2568 cache_prefetch_next ((__m128i*)ps);
2569 cache_prefetch_next ((__m128i*)pd);
2570 cache_prefetch_next ((__m128i*)pm);
2572 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2573 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2574 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2576 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2577 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2578 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2580 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2581 &xmm_mask_lo, &xmm_mask_hi,
2582 &xmm_src_lo, &xmm_src_hi);
2585 (__m128i*)pd, pack_2x128_128 (
2586 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2587 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2601 *pd++ = pack_1x64_32 (
2602 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2603 unpack_32_1x64 (m)),
2604 unpack_32_1x64 (d)));
2609 /* ---------------------------------------------------
2610 * fb_compose_setup_sse2
2612 static force_inline __m64
2613 create_mask_16_64 (uint16_t mask)
2615 return _mm_set1_pi16 (mask);
2618 static force_inline __m128i
2619 create_mask_16_128 (uint16_t mask)
2621 return _mm_set1_epi16 (mask);
2624 static force_inline __m64
2625 create_mask_2x32_64 (uint32_t mask0,
2628 return _mm_set_pi32 (mask0, mask1);
2631 /* Work around a code generation bug in Sun Studio 12. */
2632 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2633 # define create_mask_2x32_128(mask0, mask1) \
2634 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2636 static force_inline __m128i
2637 create_mask_2x32_128 (uint32_t mask0,
2640 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
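/* The create_mask_* helpers are used by the SSE2 setup code later in this
 * file (not part of this excerpt) to fill in the mask_* globals declared
 * at the top, along the lines of:
 *
 *     mask_0080 = create_mask_16_128 (0x0080);
 *     mask_00ff = create_mask_16_128 (0x00ff);
 *     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
 */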
2644 /* SSE2 code patch for fbcompose.c */
2647 sse2_combine_over_u (pixman_implementation_t *imp,
2650 const uint32_t * src,
2651 const uint32_t * mask,
2654 core_combine_over_u_sse2 (dst, src, mask, width);
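/* sse2_combine_over_u above, and the rest of the sse2_combine_* wrappers
 * that follow, simply forward to the matching core_combine_*_sse2 worker;
 * the implementation pointer and the other combiner-interface arguments
 * play no role in the forwarding itself.
 */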
2659 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2662 const uint32_t * src,
2663 const uint32_t * mask,
2666 core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2671 sse2_combine_in_u (pixman_implementation_t *imp,
2674 const uint32_t * src,
2675 const uint32_t * mask,
2678 core_combine_in_u_sse2 (dst, src, mask, width);
2683 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2686 const uint32_t * src,
2687 const uint32_t * mask,
2690 core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2695 sse2_combine_out_u (pixman_implementation_t *imp,
2698 const uint32_t * src,
2699 const uint32_t * mask,
2702 core_combine_out_u_sse2 (dst, src, mask, width);
2707 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2710 const uint32_t * src,
2711 const uint32_t * mask,
2714 core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2719 sse2_combine_atop_u (pixman_implementation_t *imp,
2722 const uint32_t * src,
2723 const uint32_t * mask,
2726 core_combine_atop_u_sse2 (dst, src, mask, width);
2731 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2734 const uint32_t * src,
2735 const uint32_t * mask,
2738 core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2743 sse2_combine_xor_u (pixman_implementation_t *imp,
2746 const uint32_t * src,
2747 const uint32_t * mask,
2750 core_combine_xor_u_sse2 (dst, src, mask, width);
2755 sse2_combine_add_u (pixman_implementation_t *imp,
2758 const uint32_t * src,
2759 const uint32_t * mask,
2762 core_combine_add_u_sse2 (dst, src, mask, width);
2767 sse2_combine_saturate_u (pixman_implementation_t *imp,
2770 const uint32_t * src,
2771 const uint32_t * mask,
2774 core_combine_saturate_u_sse2 (dst, src, mask, width);
2779 sse2_combine_src_ca (pixman_implementation_t *imp,
2782 const uint32_t * src,
2783 const uint32_t * mask,
2786 core_combine_src_ca_sse2 (dst, src, mask, width);
2791 sse2_combine_over_ca (pixman_implementation_t *imp,
2794 const uint32_t * src,
2795 const uint32_t * mask,
2798 core_combine_over_ca_sse2 (dst, src, mask, width);
2803 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2806 const uint32_t * src,
2807 const uint32_t * mask,
2810 core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2815 sse2_combine_in_ca (pixman_implementation_t *imp,
2818 const uint32_t * src,
2819 const uint32_t * mask,
2822 core_combine_in_ca_sse2 (dst, src, mask, width);
2827 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2830 const uint32_t * src,
2831 const uint32_t * mask,
2834 core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2839 sse2_combine_out_ca (pixman_implementation_t *imp,
2842 const uint32_t * src,
2843 const uint32_t * mask,
2846 core_combine_out_ca_sse2 (dst, src, mask, width);
2851 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2854 const uint32_t * src,
2855 const uint32_t * mask,
2858 core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2863 sse2_combine_atop_ca (pixman_implementation_t *imp,
2866 const uint32_t * src,
2867 const uint32_t * mask,
2870 core_combine_atop_ca_sse2 (dst, src, mask, width);
2875 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2878 const uint32_t * src,
2879 const uint32_t * mask,
2882 core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2887 sse2_combine_xor_ca (pixman_implementation_t *imp,
2890 const uint32_t * src,
2891 const uint32_t * mask,
2894 core_combine_xor_ca_sse2 (dst, src, mask, width);
2899 sse2_combine_add_ca (pixman_implementation_t *imp,
2902 const uint32_t * src,
2903 const uint32_t * mask,
2906 core_combine_add_ca_sse2 (dst, src, mask, width);
2910 /* -------------------------------------------------------------------
2911 * composite_over_n_8888
2915 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2917 pixman_image_t * src_image,
2918 pixman_image_t * mask_image,
2919 pixman_image_t * dst_image,
2930 uint32_t *dst_line, *dst, d;
2933 __m128i xmm_src, xmm_alpha;
2934 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2936 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2941 PIXMAN_IMAGE_GET_LINE (
2942 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2944 xmm_src = expand_pixel_32_1x128 (src);
2945 xmm_alpha = expand_alpha_1x128 (xmm_src);
2951 /* call prefetch hint to optimize cache load*/
2952 cache_prefetch ((__m128i*)dst);
2954 dst_line += dst_stride;
2957 while (w && (unsigned long)dst & 15)
2960 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2961 _mm_movepi64_pi64 (xmm_alpha),
2962 unpack_32_1x64 (d)));
2966 cache_prefetch ((__m128i*)dst);
2970 /* fill cache line with next memory */
2971 cache_prefetch_next ((__m128i*)dst);
2973 xmm_dst = load_128_aligned ((__m128i*)dst);
2975 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2977 over_2x128 (&xmm_src, &xmm_src,
2978 &xmm_alpha, &xmm_alpha,
2979 &xmm_dst_lo, &xmm_dst_hi);
2981 /* rebuild the 4 pixel data and save */
2983 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2992 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2993 _mm_movepi64_pi64 (xmm_alpha),
2994 unpack_32_1x64 (d)));
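/* Illustrative scalar sketch of the per-pixel OVER math that the loops above
 * perform with a solid source (the helper below is hypothetical and not part
 * of the SSE2 fast paths; it assumes premultiplied a8r8g8b8 pixels): each
 * channel computes d = s + d * (255 - alpha (src)) / 255, rounded.
 */
static force_inline uint32_t
scalar_over_8888 (uint32_t src, uint32_t dst)
{
    uint32_t ia = 255 - (src >> 24);	/* inverse source alpha */
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
	uint32_t s = (src >> shift) & 0xff;
	uint32_t d = (dst >> shift) & 0xff;
	uint32_t t = d * ia + 0x80;

	/* (t + (t >> 8)) >> 8 is the usual rounded division by 255 */
	result |= ((s + ((t + (t >> 8)) >> 8)) & 0xff) << shift;
    }

    return result;
}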
3002 /* ---------------------------------------------------------------------
3003 * composite_over_n_0565
3006 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3008 pixman_image_t * src_image,
3009 pixman_image_t * mask_image,
3010 pixman_image_t * dst_image,
3021 uint16_t *dst_line, *dst, d;
3024 __m128i xmm_src, xmm_alpha;
3025 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3027 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3032 PIXMAN_IMAGE_GET_LINE (
3033 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3035 xmm_src = expand_pixel_32_1x128 (src);
3036 xmm_alpha = expand_alpha_1x128 (xmm_src);
3042 /* call prefetch hint to optimize cache load*/
3043 cache_prefetch ((__m128i*)dst);
3045 dst_line += dst_stride;
3048 while (w && (unsigned long)dst & 15)
3052 *dst++ = pack_565_32_16 (
3053 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3054 _mm_movepi64_pi64 (xmm_alpha),
3055 expand565_16_1x64 (d))));
3059 /* call prefetch hint to optimize cache load*/
3060 cache_prefetch ((__m128i*)dst);
3064 /* fill cache line with next memory */
3065 cache_prefetch_next ((__m128i*)dst);
3067 xmm_dst = load_128_aligned ((__m128i*)dst);
3069 unpack_565_128_4x128 (xmm_dst,
3070 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3072 over_2x128 (&xmm_src, &xmm_src,
3073 &xmm_alpha, &xmm_alpha,
3074 &xmm_dst0, &xmm_dst1);
3075 over_2x128 (&xmm_src, &xmm_src,
3076 &xmm_alpha, &xmm_alpha,
3077 &xmm_dst2, &xmm_dst3);
3079 xmm_dst = pack_565_4x128_128 (
3080 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3082 save_128_aligned ((__m128i*)dst, xmm_dst);
3091 *dst++ = pack_565_32_16 (
3092 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3093 _mm_movepi64_pi64 (xmm_alpha),
3094 expand565_16_1x64 (d))));
3101 /* ------------------------------
3102 * composite_add_n_8888_8888_ca
3105 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3107 pixman_image_t * src_image,
3108 pixman_image_t * mask_image,
3109 pixman_image_t * dst_image,
3120 uint32_t *dst_line, d;
3121 uint32_t *mask_line, m;
3123 int dst_stride, mask_stride;
3125 __m128i xmm_src, xmm_alpha;
3127 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3129 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3131 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3137 PIXMAN_IMAGE_GET_LINE (
3138 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3139 PIXMAN_IMAGE_GET_LINE (
3140 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3142 xmm_src = _mm_unpacklo_epi8 (
3143 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3144 xmm_alpha = expand_alpha_1x128 (xmm_src);
3145 mmx_src = _mm_movepi64_pi64 (xmm_src);
3146 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3151 const uint32_t *pm = (uint32_t *)mask_line;
3152 uint32_t *pd = (uint32_t *)dst_line;
3154 dst_line += dst_stride;
3155 mask_line += mask_stride;
3157 /* call prefetch hint to optimize cache load*/
3158 cache_prefetch ((__m128i*)pd);
3159 cache_prefetch ((__m128i*)pm);
3161 while (w && (unsigned long)pd & 15)
3169 mmx_mask = unpack_32_1x64 (m);
3170 mmx_dest = unpack_32_1x64 (d);
3172 *pd = pack_1x64_32 (
3173 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3180 /* call prefetch hint to optimize cache load*/
3181 cache_prefetch ((__m128i*)pd);
3182 cache_prefetch ((__m128i*)pm);
3186 /* fill cache line with next memory */
3187 cache_prefetch_next ((__m128i*)pd);
3188 cache_prefetch_next ((__m128i*)pm);
3190 xmm_mask = load_128_unaligned ((__m128i*)pm);
3194 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3196 /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
3197 if (pack_cmp != 0xffff)
3199 xmm_dst = load_128_aligned ((__m128i*)pd);
3201 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3203 pix_multiply_2x128 (&xmm_src, &xmm_src,
3204 &xmm_mask_lo, &xmm_mask_hi,
3205 &xmm_mask_lo, &xmm_mask_hi);
3206 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3209 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3225 mmx_mask = unpack_32_1x64 (m);
3226 mmx_dest = unpack_32_1x64 (d);
3228 *pd = pack_1x64_32 (
3229 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
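/* Per channel, the loops above compute dst = clamp (dst + mask * src / 255):
 * the solid source is multiplied by the component-alpha mask and the result
 * is added to the destination with unsigned saturation.
 */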
3240 /* ---------------------------------------------------------------------------
3241 * composite_over_n_8888_8888_ca
3245 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3247 pixman_image_t * src_image,
3248 pixman_image_t * mask_image,
3249 pixman_image_t * dst_image,
3260 uint32_t *dst_line, d;
3261 uint32_t *mask_line, m;
3263 int dst_stride, mask_stride;
3265 __m128i xmm_src, xmm_alpha;
3266 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3267 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3269 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3271 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3276 PIXMAN_IMAGE_GET_LINE (
3277 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3278 PIXMAN_IMAGE_GET_LINE (
3279 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3281 xmm_src = _mm_unpacklo_epi8 (
3282 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3283 xmm_alpha = expand_alpha_1x128 (xmm_src);
3284 mmx_src = _mm_movepi64_pi64 (xmm_src);
3285 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3290 const uint32_t *pm = (uint32_t *)mask_line;
3291 uint32_t *pd = (uint32_t *)dst_line;
3293 dst_line += dst_stride;
3294 mask_line += mask_stride;
3296 /* call prefetch hint to optimize cache load*/
3297 cache_prefetch ((__m128i*)pd);
3298 cache_prefetch ((__m128i*)pm);
3300 while (w && (unsigned long)pd & 15)
3307 mmx_mask = unpack_32_1x64 (m);
3308 mmx_dest = unpack_32_1x64 (d);
3310 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3320 /* call prefetch hint to optimize cache load*/
3321 cache_prefetch ((__m128i*)pd);
3322 cache_prefetch ((__m128i*)pm);
3326 /* fill cache line with next memory */
3327 cache_prefetch_next ((__m128i*)pd);
3328 cache_prefetch_next ((__m128i*)pm);
3330 xmm_mask = load_128_unaligned ((__m128i*)pm);
3334 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3336 /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
3337 if (pack_cmp != 0xffff)
3339 xmm_dst = load_128_aligned ((__m128i*)pd);
3341 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3342 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3344 in_over_2x128 (&xmm_src, &xmm_src,
3345 &xmm_alpha, &xmm_alpha,
3346 &xmm_mask_lo, &xmm_mask_hi,
3347 &xmm_dst_lo, &xmm_dst_hi);
3350 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3365 mmx_mask = unpack_32_1x64 (m);
3366 mmx_dest = unpack_32_1x64 (d);
3368 *pd = pack_1x64_32 (
3369 in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
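/* In component-alpha mode, each channel of the loops above effectively
 * evaluates dst = src * mask / 255 + dst * (255 - alpha (src) * mask / 255) / 255,
 * which is the per-channel form of the in_over operation that
 * in_over_2x128 () applies four pixels at a time.
 */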
3380 /*---------------------------------------------------------------------
3381 * composite_over_8888_n_8888
3385 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3387 pixman_image_t * src_image,
3388 pixman_image_t * mask_image,
3389 pixman_image_t * dst_image,
3399 uint32_t *dst_line, *dst;
3400 uint32_t *src_line, *src;
3403 int dst_stride, src_stride;
3406 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3407 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3408 __m128i xmm_alpha_lo, xmm_alpha_hi;
3410 PIXMAN_IMAGE_GET_LINE (
3411 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3412 PIXMAN_IMAGE_GET_LINE (
3413 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3415 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3417 xmm_mask = create_mask_16_128 (mask >> 24);
3422 dst_line += dst_stride;
3424 src_line += src_stride;
3427 /* call prefetch hint to optimize cache load*/
3428 cache_prefetch ((__m128i*)dst);
3429 cache_prefetch ((__m128i*)src);
3431 while (w && (unsigned long)dst & 15)
3433 uint32_t s = *src++;
3436 __m64 ms = unpack_32_1x64 (s);
3437 __m64 alpha = expand_alpha_1x64 (ms);
3438 __m64 dest = _mm_movepi64_pi64 (xmm_mask);
3439 __m64 alpha_dst = unpack_32_1x64 (d);
3441 *dst++ = pack_1x64_32 (
3442 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
3447 /* call prefetch hint to optimize cache load*/
3448 cache_prefetch ((__m128i*)dst);
3449 cache_prefetch ((__m128i*)src);
3453 /* fill cache line with next memory */
3454 cache_prefetch_next ((__m128i*)dst);
3455 cache_prefetch_next ((__m128i*)src);
3457 xmm_src = load_128_unaligned ((__m128i*)src);
3458 xmm_dst = load_128_aligned ((__m128i*)dst);
3460 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3461 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3462 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3463 &xmm_alpha_lo, &xmm_alpha_hi);
3465 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3466 &xmm_alpha_lo, &xmm_alpha_hi,
3467 &xmm_mask, &xmm_mask,
3468 &xmm_dst_lo, &xmm_dst_hi);
3471 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3480 uint32_t s = *src++;
3483 __m64 ms = unpack_32_1x64 (s);
3484 __m64 alpha = expand_alpha_1x64 (ms);
3485 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3486 __m64 dest = unpack_32_1x64 (d);
3488 *dst++ = pack_1x64_32 (
3489 in_over_1x64 (&ms, &alpha, &mask, &dest));
3498 /* ---------------------------------------------------------------------
3499 * composite_over_x888_n_8888
3502 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3504 pixman_image_t * src_image,
3505 pixman_image_t * mask_image,
3506 pixman_image_t * dst_image,
3516 uint32_t *dst_line, *dst;
3517 uint32_t *src_line, *src;
3519 int dst_stride, src_stride;
3522 __m128i xmm_mask, xmm_alpha;
3523 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3524 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3526 PIXMAN_IMAGE_GET_LINE (
3527 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3528 PIXMAN_IMAGE_GET_LINE (
3529 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3531 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3533 xmm_mask = create_mask_16_128 (mask >> 24);
3534 xmm_alpha = mask_00ff;
3539 dst_line += dst_stride;
3541 src_line += src_stride;
3544 /* call prefetch hint to optimize cache load*/
3545 cache_prefetch ((__m128i*)dst);
3546 cache_prefetch ((__m128i*)src);
3548 while (w && (unsigned long)dst & 15)
3550 uint32_t s = (*src++) | 0xff000000;
3553 __m64 src = unpack_32_1x64 (s);
3554 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3555 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3556 __m64 dest = unpack_32_1x64 (d);
3558 *dst++ = pack_1x64_32 (
3559 in_over_1x64 (&src, &alpha, &mask, &dest));
3564 /* call prefetch hint to optimize cache load*/
3565 cache_prefetch ((__m128i*)dst);
3566 cache_prefetch ((__m128i*)src);
3570 /* fill cache line with next memory */
3571 cache_prefetch_next ((__m128i*)dst);
3572 cache_prefetch_next ((__m128i*)src);
3574 xmm_src = _mm_or_si128 (
3575 load_128_unaligned ((__m128i*)src), mask_ff000000);
3576 xmm_dst = load_128_aligned ((__m128i*)dst);
3578 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3579 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3581 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3582 &xmm_alpha, &xmm_alpha,
3583 &xmm_mask, &xmm_mask,
3584 &xmm_dst_lo, &xmm_dst_hi);
3587 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3597 uint32_t s = (*src++) | 0xff000000;
3600 __m64 src = unpack_32_1x64 (s);
3601 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3602 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3603 __m64 dest = unpack_32_1x64 (d);
3605 *dst++ = pack_1x64_32 (
3606 in_over_1x64 (&src, &alpha, &mask, &dest));
3615 /* --------------------------------------------------------------------
3616 * composite_over_8888_8888
3619 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3621 pixman_image_t * src_image,
3622 pixman_image_t * mask_image,
3623 pixman_image_t * dst_image,
3633 int dst_stride, src_stride;
3634 uint32_t *dst_line, *dst;
3635 uint32_t *src_line, *src;
3637 PIXMAN_IMAGE_GET_LINE (
3638 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3639 PIXMAN_IMAGE_GET_LINE (
3640 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3647 core_combine_over_u_sse2 (dst, src, NULL, width);
3655 /* ------------------------------------------------------------------
3656 * composite_over_8888_0565
3658 static force_inline uint16_t
3659 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3663 ms = unpack_32_1x64 (src);
3664 return pack_565_32_16 (
3667 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
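/* The helper above expands the r5g6b5 destination pixel to 8 bits per
 * channel, performs a premultiplied OVER against the a8r8g8b8 source and
 * packs the result back to r5g6b5; a fully opaque source therefore simply
 * replaces the destination with the source converted to r5g6b5.
 */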
3671 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3673 pixman_image_t * src_image,
3674 pixman_image_t * mask_image,
3675 pixman_image_t * dst_image,
3685 uint16_t *dst_line, *dst, d;
3686 uint32_t *src_line, *src, s;
3687 int dst_stride, src_stride;
3690 __m128i xmm_alpha_lo, xmm_alpha_hi;
3691 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3692 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3694 PIXMAN_IMAGE_GET_LINE (
3695 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3696 PIXMAN_IMAGE_GET_LINE (
3697 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3702 * This code was copied from the MMX implementation, FIXME included.
3703 * If it is a problem there, it is probably a problem here too.
3705 assert (src_image->drawable == mask_image->drawable);
3713 /* call prefetch hint to optimize cache load*/
3714 cache_prefetch ((__m128i*)src);
3715 cache_prefetch ((__m128i*)dst);
3717 dst_line += dst_stride;
3718 src_line += src_stride;
3721 /* Align dst on a 16-byte boundary */
3723 ((unsigned long)dst & 15))
3728 *dst++ = composite_over_8888_0565pixel (s, d);
3732 /* call prefetch hint to optimize cache load*/
3733 cache_prefetch ((__m128i*)src);
3734 cache_prefetch ((__m128i*)dst);
3736 /* This is an 8-pixel loop */
3739 /* fill cache line with next memory */
3740 cache_prefetch_next ((__m128i*)src);
3741 cache_prefetch_next ((__m128i*)dst);
3743 /* Load the source unaligned because its address
3744 * alignment is not guaranteed.
3746 xmm_src = load_128_unaligned ((__m128i*) src);
3747 xmm_dst = load_128_aligned ((__m128i*) dst);
3750 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3751 unpack_565_128_4x128 (xmm_dst,
3752 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3753 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3754 &xmm_alpha_lo, &xmm_alpha_hi);
3756 /* Load the next 4 source pixels from memory
3757 * early, to overlap the read with the math below.
3759 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3761 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3762 &xmm_alpha_lo, &xmm_alpha_hi,
3763 &xmm_dst0, &xmm_dst1);
3766 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3767 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3768 &xmm_alpha_lo, &xmm_alpha_hi);
3770 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3771 &xmm_alpha_lo, &xmm_alpha_hi,
3772 &xmm_dst2, &xmm_dst3);
3775 (__m128i*)dst, pack_565_4x128_128 (
3776 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3788 *dst++ = composite_over_8888_0565pixel (s, d);
3795 /* -----------------------------------------------------------------
3796 * composite_over_n_8_8888
3800 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3802 pixman_image_t * src_image,
3803 pixman_image_t * mask_image,
3804 pixman_image_t * dst_image,
3815 uint32_t *dst_line, *dst;
3816 uint8_t *mask_line, *mask;
3817 int dst_stride, mask_stride;
3821 __m128i xmm_src, xmm_alpha, xmm_def;
3822 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3823 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3825 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3827 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3833 PIXMAN_IMAGE_GET_LINE (
3834 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3835 PIXMAN_IMAGE_GET_LINE (
3836 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3838 xmm_def = create_mask_2x32_128 (src, src);
3839 xmm_src = expand_pixel_32_1x128 (src);
3840 xmm_alpha = expand_alpha_1x128 (xmm_src);
3841 mmx_src = _mm_movepi64_pi64 (xmm_src);
3842 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3847 dst_line += dst_stride;
3849 mask_line += mask_stride;
3852 /* call prefetch hint to optimize cache load*/
3853 cache_prefetch ((__m128i*)mask);
3854 cache_prefetch ((__m128i*)dst);
3856 while (w && (unsigned long)dst & 15)
3858 uint8_t m = *mask++;
3863 mmx_mask = expand_pixel_8_1x64 (m);
3864 mmx_dest = unpack_32_1x64 (d);
3866 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3876 /* call prefetch hint to optimize cache load*/
3877 cache_prefetch ((__m128i*)mask);
3878 cache_prefetch ((__m128i*)dst);
3882 /* fill cache line with next memory */
3883 cache_prefetch_next ((__m128i*)mask);
3884 cache_prefetch_next ((__m128i*)dst);
3886 m = *((uint32_t*)mask);
3888 if (srca == 0xff && m == 0xffffffff)
3890 save_128_aligned ((__m128i*)dst, xmm_def);
3894 xmm_dst = load_128_aligned ((__m128i*) dst);
3895 xmm_mask = unpack_32_1x128 (m);
3896 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3899 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3900 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3902 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3903 &xmm_mask_lo, &xmm_mask_hi);
3905 in_over_2x128 (&xmm_src, &xmm_src,
3906 &xmm_alpha, &xmm_alpha,
3907 &xmm_mask_lo, &xmm_mask_hi,
3908 &xmm_dst_lo, &xmm_dst_hi);
3911 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3921 uint8_t m = *mask++;
3926 mmx_mask = expand_pixel_8_1x64 (m);
3927 mmx_dest = unpack_32_1x64 (d);
3929 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3943 /* ----------------------------------------------------------------
3944 * pixman_fill_sse2
3948 pixman_fill_sse2 (uint32_t *bits,
3957 uint32_t byte_width;
3962 if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3965 if (bpp != 16 && bpp != 32)
3970 stride = stride * (int) sizeof (uint32_t) / 2;
3971 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3972 byte_width = 2 * width;
3977 stride = stride * (int) sizeof (uint32_t) / 4;
3978 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3979 byte_width = 4 * width;
3983 cache_prefetch ((__m128i*)byte_line);
3984 xmm_def = create_mask_2x32_128 (data, data);
3989 uint8_t *d = byte_line;
3990 byte_line += stride;
3994 cache_prefetch_next ((__m128i*)d);
3996 while (w >= 2 && ((unsigned long)d & 3))
3998 *(uint16_t *)d = data;
4003 while (w >= 4 && ((unsigned long)d & 15))
4005 *(uint32_t *)d = data;
4011 cache_prefetch_next ((__m128i*)d);
4015 cache_prefetch (((__m128i*)d) + 12);
4017 save_128_aligned ((__m128i*)(d), xmm_def);
4018 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4019 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4020 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4021 save_128_aligned ((__m128i*)(d + 64), xmm_def);
4022 save_128_aligned ((__m128i*)(d + 80), xmm_def);
4023 save_128_aligned ((__m128i*)(d + 96), xmm_def);
4024 save_128_aligned ((__m128i*)(d + 112), xmm_def);
4032 cache_prefetch (((__m128i*)d) + 8);
4034 save_128_aligned ((__m128i*)(d), xmm_def);
4035 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4036 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4037 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4043 cache_prefetch_next ((__m128i*)d);
4047 save_128_aligned ((__m128i*)(d), xmm_def);
4048 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4056 save_128_aligned ((__m128i*)(d), xmm_def);
4062 cache_prefetch_next ((__m128i*)d);
4066 *(uint32_t *)d = data;
4074 *(uint16_t *)d = data;
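/* Usage sketch (illustrative only; the variable names are hypothetical):
 *
 *     pixman_fill_sse2 (image->bits.bits, image->bits.rowstride,
 *                       PIXMAN_FORMAT_BPP (image->bits.format),
 *                       x, y, width, height, filler);
 *
 * For bpp == 16 the 32-bit filler must contain the same 16-bit pixel in
 * both halves, otherwise the function bails out so a fallback can run.
 */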
4085 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4087 pixman_image_t * src_image,
4088 pixman_image_t * mask_image,
4089 pixman_image_t * dst_image,
4100 uint32_t *dst_line, *dst;
4101 uint8_t *mask_line, *mask;
4102 int dst_stride, mask_stride;
4106 __m128i xmm_src, xmm_def;
4107 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4109 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4114 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4115 PIXMAN_FORMAT_BPP (dst_image->bits.format),
4116 dest_x, dest_y, width, height, 0);
4120 PIXMAN_IMAGE_GET_LINE (
4121 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4122 PIXMAN_IMAGE_GET_LINE (
4123 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4125 xmm_def = create_mask_2x32_128 (src, src);
4126 xmm_src = expand_pixel_32_1x128 (src);
4131 dst_line += dst_stride;
4133 mask_line += mask_stride;
4136 /* call prefetch hint to optimize cache load*/
4137 cache_prefetch ((__m128i*)mask);
4138 cache_prefetch ((__m128i*)dst);
4140 while (w && (unsigned long)dst & 15)
4142 uint8_t m = *mask++;
4146 *dst = pack_1x64_32 (
4148 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4159 /* call prefetch hint to optimize cache load*/
4160 cache_prefetch ((__m128i*)mask);
4161 cache_prefetch ((__m128i*)dst);
4165 /* fill cache line with next memory */
4166 cache_prefetch_next ((__m128i*)mask);
4167 cache_prefetch_next ((__m128i*)dst);
4169 m = *((uint32_t*)mask);
4171 if (srca == 0xff && m == 0xffffffff)
4173 save_128_aligned ((__m128i*)dst, xmm_def);
4177 xmm_mask = unpack_32_1x128 (m);
4178 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4181 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4183 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4184 &xmm_mask_lo, &xmm_mask_hi);
4186 pix_multiply_2x128 (&xmm_src, &xmm_src,
4187 &xmm_mask_lo, &xmm_mask_hi,
4188 &xmm_mask_lo, &xmm_mask_hi);
4191 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4195 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4205 uint8_t m = *mask++;
4209 *dst = pack_1x64_32 (
4211 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4226 /*-----------------------------------------------------------------------
4227 * composite_over_n_8_0565
4231 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4233 pixman_image_t * src_image,
4234 pixman_image_t * mask_image,
4235 pixman_image_t * dst_image,
4246 uint16_t *dst_line, *dst, d;
4247 uint8_t *mask_line, *mask;
4248 int dst_stride, mask_stride;
4251 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4253 __m128i xmm_src, xmm_alpha;
4254 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4255 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4257 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4263 PIXMAN_IMAGE_GET_LINE (
4264 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4265 PIXMAN_IMAGE_GET_LINE (
4266 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4268 xmm_src = expand_pixel_32_1x128 (src);
4269 xmm_alpha = expand_alpha_1x128 (xmm_src);
4270 mmx_src = _mm_movepi64_pi64 (xmm_src);
4271 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4276 dst_line += dst_stride;
4278 mask_line += mask_stride;
4281 /* call prefetch hint to optimize cache load*/
4282 cache_prefetch ((__m128i*)mask);
4283 cache_prefetch ((__m128i*)dst);
4285 while (w && (unsigned long)dst & 15)
4292 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4293 mmx_dest = expand565_16_1x64 (d);
4295 *dst = pack_565_32_16 (
4298 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4305 /* call prefetch hint to optimize cache load*/
4306 cache_prefetch ((__m128i*)mask);
4307 cache_prefetch ((__m128i*)dst);
4311 /* fill cache line with next memory */
4312 cache_prefetch_next ((__m128i*)mask);
4313 cache_prefetch_next ((__m128i*)dst);
4315 xmm_dst = load_128_aligned ((__m128i*) dst);
4316 unpack_565_128_4x128 (xmm_dst,
4317 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4319 m = *((uint32_t*)mask);
4324 xmm_mask = unpack_32_1x128 (m);
4325 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4328 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4330 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4331 &xmm_mask_lo, &xmm_mask_hi);
4333 in_over_2x128 (&xmm_src, &xmm_src,
4334 &xmm_alpha, &xmm_alpha,
4335 &xmm_mask_lo, &xmm_mask_hi,
4336 &xmm_dst0, &xmm_dst1);
4339 m = *((uint32_t*)mask);
4344 xmm_mask = unpack_32_1x128 (m);
4345 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4348 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4350 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4351 &xmm_mask_lo, &xmm_mask_hi);
4352 in_over_2x128 (&xmm_src, &xmm_src,
4353 &xmm_alpha, &xmm_alpha,
4354 &xmm_mask_lo, &xmm_mask_hi,
4355 &xmm_dst2, &xmm_dst3);
4359 (__m128i*)dst, pack_565_4x128_128 (
4360 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4373 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4374 mmx_dest = expand565_16_1x64 (d);
4376 *dst = pack_565_32_16 (
4379 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4390 /* -----------------------------------------------------------------------
4391 * composite_over_pixbuf_0565
4395 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4397 pixman_image_t * src_image,
4398 pixman_image_t * mask_image,
4399 pixman_image_t * dst_image,
4409 uint16_t *dst_line, *dst, d;
4410 uint32_t *src_line, *src, s;
4411 int dst_stride, src_stride;
4413 uint32_t opaque, zero;
4416 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4417 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4419 PIXMAN_IMAGE_GET_LINE (
4420 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4421 PIXMAN_IMAGE_GET_LINE (
4422 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4427 * This code was copied from the MMX implementation, FIXME included.
4428 * If it is a problem there, it is probably a problem here too.
4430 assert (src_image->drawable == mask_image->drawable);
4436 dst_line += dst_stride;
4438 src_line += src_stride;
4441 /* call prefetch hint to optimize cache load*/
4442 cache_prefetch ((__m128i*)src);
4443 cache_prefetch ((__m128i*)dst);
4445 while (w && (unsigned long)dst & 15)
4450 ms = unpack_32_1x64 (s);
4452 *dst++ = pack_565_32_16 (
4454 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4458 /* call prefetch hint to optimize cache load*/
4459 cache_prefetch ((__m128i*)src);
4460 cache_prefetch ((__m128i*)dst);
4464 /* fill cache line with next memory */
4465 cache_prefetch_next ((__m128i*)src);
4466 cache_prefetch_next ((__m128i*)dst);
4469 xmm_src = load_128_unaligned ((__m128i*)src);
4470 xmm_dst = load_128_aligned ((__m128i*)dst);
4472 opaque = is_opaque (xmm_src);
4473 zero = is_zero (xmm_src);
4475 unpack_565_128_4x128 (xmm_dst,
4476 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4477 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4479 /* preload next round */
4480 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4484 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4485 &xmm_dst0, &xmm_dst1);
4489 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4490 &xmm_dst0, &xmm_dst1);
4494 opaque = is_opaque (xmm_src);
4495 zero = is_zero (xmm_src);
4497 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4501 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4502 &xmm_dst2, &xmm_dst3);
4506 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4507 &xmm_dst2, &xmm_dst3);
4511 (__m128i*)dst, pack_565_4x128_128 (
4512 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4524 ms = unpack_32_1x64 (s);
4526 *dst++ = pack_565_32_16 (
4528 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4536 /* -------------------------------------------------------------------------
4537 * composite_over_pixbuf_8888
4541 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4543 pixman_image_t * src_image,
4544 pixman_image_t * mask_image,
4545 pixman_image_t * dst_image,
4555 uint32_t *dst_line, *dst, d;
4556 uint32_t *src_line, *src, s;
4557 int dst_stride, src_stride;
4559 uint32_t opaque, zero;
4561 __m128i xmm_src_lo, xmm_src_hi;
4562 __m128i xmm_dst_lo, xmm_dst_hi;
4564 PIXMAN_IMAGE_GET_LINE (
4565 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4566 PIXMAN_IMAGE_GET_LINE (
4567 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4572 * This code was copied from the MMX implementation, FIXME included.
4573 * If it is a problem there, it is probably a problem here too.
4575 assert (src_image->drawable == mask_image->drawable);
4581 dst_line += dst_stride;
4583 src_line += src_stride;
4586 /* call prefetch hint to optimize cache load*/
4587 cache_prefetch ((__m128i*)src);
4588 cache_prefetch ((__m128i*)dst);
4590 while (w && (unsigned long)dst & 15)
4595 *dst++ = pack_1x64_32 (
4596 over_rev_non_pre_1x64 (
4597 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4602 /* call prefetch hint to optimize cache load*/
4603 cache_prefetch ((__m128i*)src);
4604 cache_prefetch ((__m128i*)dst);
4608 /* fill cache line with next memory */
4609 cache_prefetch_next ((__m128i*)src);
4610 cache_prefetch_next ((__m128i*)dst);
4612 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4614 opaque = is_opaque (xmm_src_hi);
4615 zero = is_zero (xmm_src_hi);
4617 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4621 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4622 &xmm_dst_lo, &xmm_dst_hi);
4625 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4629 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4631 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4633 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4634 &xmm_dst_lo, &xmm_dst_hi);
4637 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4650 *dst++ = pack_1x64_32 (
4651 over_rev_non_pre_1x64 (
4652 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4661 /* -------------------------------------------------------------------------------------------------
4662 * composite_over_n_8888_0565_ca
4666 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4668 pixman_image_t * src_image,
4669 pixman_image_t * mask_image,
4670 pixman_image_t * dst_image,
4681 uint16_t *dst_line, *dst, d;
4682 uint32_t *mask_line, *mask, m;
4683 int dst_stride, mask_stride;
4687 __m128i xmm_src, xmm_alpha;
4688 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4689 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4691 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4693 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4698 PIXMAN_IMAGE_GET_LINE (
4699 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4700 PIXMAN_IMAGE_GET_LINE (
4701 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4703 xmm_src = expand_pixel_32_1x128 (src);
4704 xmm_alpha = expand_alpha_1x128 (xmm_src);
4705 mmx_src = _mm_movepi64_pi64 (xmm_src);
4706 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4713 mask_line += mask_stride;
4714 dst_line += dst_stride;
4716 /* call prefetch hint to optimize cache load*/
4717 cache_prefetch ((__m128i*)mask);
4718 cache_prefetch ((__m128i*)dst);
4720 while (w && ((unsigned long)dst & 15))
4722 m = *(uint32_t *) mask;
4727 mmx_mask = unpack_32_1x64 (m);
4728 mmx_dest = expand565_16_1x64 (d);
4730 *dst = pack_565_32_16 (
4733 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4741 /* call prefetch hint to optimize cache load*/
4742 cache_prefetch ((__m128i*)mask);
4743 cache_prefetch ((__m128i*)dst);
4747 /* fill cache line with next memory */
4748 cache_prefetch_next ((__m128i*)mask);
4749 cache_prefetch_next ((__m128i*)dst);
4752 xmm_mask = load_128_unaligned ((__m128i*)mask);
4753 xmm_dst = load_128_aligned ((__m128i*)dst);
4755 pack_cmp = _mm_movemask_epi8 (
4756 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4758 unpack_565_128_4x128 (xmm_dst,
4759 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4760 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4762 /* preload next round */
4763 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4765 /* skip the combine when the first four mask pixels are all zero */
4766 if (pack_cmp != 0xffff)
4768 in_over_2x128 (&xmm_src, &xmm_src,
4769 &xmm_alpha, &xmm_alpha,
4770 &xmm_mask_lo, &xmm_mask_hi,
4771 &xmm_dst0, &xmm_dst1);
4775 pack_cmp = _mm_movemask_epi8 (
4776 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4778 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4780 if (pack_cmp != 0xffff)
4782 in_over_2x128 (&xmm_src, &xmm_src,
4783 &xmm_alpha, &xmm_alpha,
4784 &xmm_mask_lo, &xmm_mask_hi,
4785 &xmm_dst2, &xmm_dst3);
4789 (__m128i*)dst, pack_565_4x128_128 (
4790 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4799 m = *(uint32_t *) mask;
4804 mmx_mask = unpack_32_1x64 (m);
4805 mmx_dest = expand565_16_1x64 (d);
4807 *dst = pack_565_32_16 (
4810 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4822 /* -----------------------------------------------------------------------
4823 * composite_in_n_8_8
4827 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4829 pixman_image_t * src_image,
4830 pixman_image_t * mask_image,
4831 pixman_image_t * dst_image,
4841 uint8_t *dst_line, *dst;
4842 uint8_t *mask_line, *mask;
4843 int dst_stride, mask_stride;
4849 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4850 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4852 PIXMAN_IMAGE_GET_LINE (
4853 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4854 PIXMAN_IMAGE_GET_LINE (
4855 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4857 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4861 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4866 dst_line += dst_stride;
4868 mask_line += mask_stride;
4871 /* call prefetch hint to optimize cache load*/
4872 cache_prefetch ((__m128i*)mask);
4873 cache_prefetch ((__m128i*)dst);
4875 while (w && ((unsigned long)dst & 15))
4877 m = (uint32_t) *mask++;
4878 d = (uint32_t) *dst;
4880 *dst++ = (uint8_t) pack_1x64_32 (
4882 pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4883 unpack_32_1x64 (m)),
4884 unpack_32_1x64 (d)));
4888 /* call prefetch hint to optimize cache load*/
4889 cache_prefetch ((__m128i*)mask);
4890 cache_prefetch ((__m128i*)dst);
4894 /* fill cache line with next memory */
4895 cache_prefetch_next ((__m128i*)mask);
4896 cache_prefetch_next ((__m128i*)dst);
4898 xmm_mask = load_128_unaligned ((__m128i*)mask);
4899 xmm_dst = load_128_aligned ((__m128i*)dst);
4901 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4902 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4904 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4905 &xmm_mask_lo, &xmm_mask_hi,
4906 &xmm_mask_lo, &xmm_mask_hi);
4908 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4909 &xmm_dst_lo, &xmm_dst_hi,
4910 &xmm_dst_lo, &xmm_dst_hi);
4913 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4922 m = (uint32_t) *mask++;
4923 d = (uint32_t) *dst;
4925 *dst++ = (uint8_t) pack_1x64_32 (
4928 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4929 unpack_32_1x64 (d)));
4937 /* ---------------------------------------------------------------------------
4942 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4944 pixman_image_t * src_image,
4945 pixman_image_t * mask_image,
4946 pixman_image_t * dst_image,
4956 uint8_t *dst_line, *dst;
4957 uint8_t *src_line, *src;
4958 int src_stride, dst_stride;
4962 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4963 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4965 PIXMAN_IMAGE_GET_LINE (
4966 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4967 PIXMAN_IMAGE_GET_LINE (
4968 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4973 dst_line += dst_stride;
4975 src_line += src_stride;
4978 /* call prefetch hint to optimize cache load*/
4979 cache_prefetch ((__m128i*)src);
4980 cache_prefetch ((__m128i*)dst);
4982 while (w && ((unsigned long)dst & 15))
4984 s = (uint32_t) *src++;
4985 d = (uint32_t) *dst;
4987 *dst++ = (uint8_t) pack_1x64_32 (
4989 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4993 /* call prefetch hint to optimize cache load*/
4994 cache_prefetch ((__m128i*)src);
4995 cache_prefetch ((__m128i*)dst);
4999 /* fill cache line with next memory */
5000 cache_prefetch_next ((__m128i*)src);
5001 cache_prefetch_next ((__m128i*)dst);
5003 xmm_src = load_128_unaligned ((__m128i*)src);
5004 xmm_dst = load_128_aligned ((__m128i*)dst);
5006 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5007 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5009 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5010 &xmm_dst_lo, &xmm_dst_hi,
5011 &xmm_dst_lo, &xmm_dst_hi);
5014 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5023 s = (uint32_t) *src++;
5024 d = (uint32_t) *dst;
5026 *dst++ = (uint8_t) pack_1x64_32 (
5027 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
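/* Scalar sketch of the IN operation above (illustrative only; the helper
 * name is hypothetical and not part of the fast paths): the destination
 * becomes src * dst / 255 with the usual rounding.
 */
static force_inline uint8_t
scalar_in_8_8 (uint8_t s, uint8_t d)
{
    uint32_t t = (uint32_t) s * d + 0x80;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}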
5035 /* -------------------------------------------------------------------------
5036 * composite_add_8888_8_8
5040 sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
5042 pixman_image_t * src_image,
5043 pixman_image_t * mask_image,
5044 pixman_image_t * dst_image,
5054 uint8_t *dst_line, *dst;
5055 uint8_t *mask_line, *mask;
5056 int dst_stride, mask_stride;
5063 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5064 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5066 PIXMAN_IMAGE_GET_LINE (
5067 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5068 PIXMAN_IMAGE_GET_LINE (
5069 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5071 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5075 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5080 dst_line += dst_stride;
5082 mask_line += mask_stride;
5085 /* call prefetch hint to optimize cache load*/
5086 cache_prefetch ((__m128i*)mask);
5087 cache_prefetch ((__m128i*)dst);
5089 while (w && ((unsigned long)dst & 15))
5091 m = (uint32_t) *mask++;
5092 d = (uint32_t) *dst;
5094 *dst++ = (uint8_t) pack_1x64_32 (
5097 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5098 unpack_32_1x64 (d)));
5102 /* call prefetch hint to optimize cache load*/
5103 cache_prefetch ((__m128i*)mask);
5104 cache_prefetch ((__m128i*)dst);
5108 /* fill cache line with next memory */
5109 cache_prefetch_next ((__m128i*)mask);
5110 cache_prefetch_next ((__m128i*)dst);
5112 xmm_mask = load_128_unaligned ((__m128i*)mask);
5113 xmm_dst = load_128_aligned ((__m128i*)dst);
5115 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5116 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5118 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5119 &xmm_mask_lo, &xmm_mask_hi,
5120 &xmm_mask_lo, &xmm_mask_hi);
5122 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5123 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5126 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5135 m = (uint32_t) *mask++;
5136 d = (uint32_t) *dst;
5138 *dst++ = (uint8_t) pack_1x64_32 (
5141 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5142 unpack_32_1x64 (d)));
5151 /* ----------------------------------------------------------------------
5152 * composite_add_8000_8000
5156 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5158 pixman_image_t * src_image,
5159 pixman_image_t * mask_image,
5160 pixman_image_t * dst_image,
5170 uint8_t *dst_line, *dst;
5171 uint8_t *src_line, *src;
5172 int dst_stride, src_stride;
5176 PIXMAN_IMAGE_GET_LINE (
5177 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5178 PIXMAN_IMAGE_GET_LINE (
5179 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5186 /* call prefetch hint to optimize cache load*/
5187 cache_prefetch ((__m128i*)src);
5188 cache_prefetch ((__m128i*)dst);
5190 dst_line += dst_stride;
5191 src_line += src_stride;
5195 while (w && (unsigned long)dst & 3)
5197 t = (*dst) + (*src++);
5198 *dst++ = t | (0 - (t >> 8));
5202 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5212 t = (*dst) + (*src++);
5213 *dst++ = t | (0 - (t >> 8));
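/* The expression above saturates the 9-bit sum to 8 bits: when t exceeds
 * 0xff, (t >> 8) is 1, so (0 - (t >> 8)) is all ones and the OR forces the
 * stored byte to 0xff; otherwise it leaves t unchanged.  For example,
 * 200 + 100 = 300 is stored as 0xff.
 */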
5221 /* ---------------------------------------------------------------------
5222 * composite_add_8888_8888
5225 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5227 pixman_image_t * src_image,
5228 pixman_image_t * mask_image,
5229 pixman_image_t * dst_image,
5239 uint32_t *dst_line, *dst;
5240 uint32_t *src_line, *src;
5241 int dst_stride, src_stride;
5243 PIXMAN_IMAGE_GET_LINE (
5244 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5245 PIXMAN_IMAGE_GET_LINE (
5246 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5251 dst_line += dst_stride;
5253 src_line += src_stride;
5255 core_combine_add_u_sse2 (dst, src, NULL, width);
5261 /* -------------------------------------------------------------------------------------------------
5262 * sse2_composite_copy_area
5265 static pixman_bool_t
5266 pixman_blt_sse2 (uint32_t *src_bits,
5279 uint8_t * src_bytes;
5280 uint8_t * dst_bytes;
5283 if (src_bpp != dst_bpp)
5288 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5289 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5290 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5291 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5292 byte_width = 2 * width;
5296 else if (src_bpp == 32)
5298 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5299 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5300 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5301 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5302 byte_width = 4 * width;
5311 cache_prefetch ((__m128i*)src_bytes);
5312 cache_prefetch ((__m128i*)dst_bytes);
5317 uint8_t *s = src_bytes;
5318 uint8_t *d = dst_bytes;
5319 src_bytes += src_stride;
5320 dst_bytes += dst_stride;
5323 cache_prefetch_next ((__m128i*)s);
5324 cache_prefetch_next ((__m128i*)d);
5326 while (w >= 2 && ((unsigned long)d & 3))
5328 *(uint16_t *)d = *(uint16_t *)s;
5334 while (w >= 4 && ((unsigned long)d & 15))
5336 *(uint32_t *)d = *(uint32_t *)s;
5343 cache_prefetch_next ((__m128i*)s);
5344 cache_prefetch_next ((__m128i*)d);
5348 __m128i xmm0, xmm1, xmm2, xmm3;
5350 /* 128 bytes ahead */
5351 cache_prefetch (((__m128i*)s) + 8);
5352 cache_prefetch (((__m128i*)d) + 8);
5354 xmm0 = load_128_unaligned ((__m128i*)(s));
5355 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5356 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5357 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5359 save_128_aligned ((__m128i*)(d), xmm0);
5360 save_128_aligned ((__m128i*)(d + 16), xmm1);
5361 save_128_aligned ((__m128i*)(d + 32), xmm2);
5362 save_128_aligned ((__m128i*)(d + 48), xmm3);
5369 cache_prefetch_next ((__m128i*)s);
5370 cache_prefetch_next ((__m128i*)d);
5374 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5381 cache_prefetch_next ((__m128i*)s);
5382 cache_prefetch_next ((__m128i*)d);
5386 *(uint32_t *)d = *(uint32_t *)s;
5395 *(uint16_t *)d = *(uint16_t *)s;
5408 sse2_composite_copy_area (pixman_implementation_t *imp,
5410 pixman_image_t * src_image,
5411 pixman_image_t * mask_image,
5412 pixman_image_t * dst_image,
5422 pixman_blt_sse2 (src_image->bits.bits,
5423 dst_image->bits.bits,
5424 src_image->bits.rowstride,
5425 dst_image->bits.rowstride,
5426 PIXMAN_FORMAT_BPP (src_image->bits.format),
5427 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5428 src_x, src_y, dest_x, dest_y, width, height);
5432 /* This code is buggy in the MMX version, and the bug was carried over to this SSE2 version */
5434 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5436 pixman_image_t * src_image,
5437 pixman_image_t * mask_image,
5438 pixman_image_t * dst_image,
5448 uint32_t *src, *src_line, s;
5449 uint32_t *dst, *dst_line, d;
5450 uint8_t *mask, *mask_line;
5452 int src_stride, mask_stride, dst_stride;
5455 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5456 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5457 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5459 PIXMAN_IMAGE_GET_LINE (
5460 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5461 PIXMAN_IMAGE_GET_LINE (
5462 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5463 PIXMAN_IMAGE_GET_LINE (
5464 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5469 src_line += src_stride;
5471 dst_line += dst_stride;
5473 mask_line += mask_stride;
5477 /* call prefetch hint to optimize cache load*/
5478 cache_prefetch ((__m128i*)src);
5479 cache_prefetch ((__m128i*)dst);
5480 cache_prefetch ((__m128i*)mask);
5482 while (w && (unsigned long)dst & 15)
5484 s = 0xff000000 | *src++;
5485 m = (uint32_t) *mask++;
5488 __m64 ms = unpack_32_1x64 (s);
5492 ms = in_over_1x64 (ms,
5494 expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
5495 unpack_32_1x64 (d));
5498 *dst++ = pack_1x64_32 (ms);
5502 /* call prefetch hint to optimize cache load*/
5503 cache_prefetch ((__m128i*)src);
5504 cache_prefetch ((__m128i*)dst);
5505 cache_prefetch ((__m128i*)mask);
5509 /* fill cache line with next memory */
5510 cache_prefetch_next ((__m128i*)src);
5511 cache_prefetch_next ((__m128i*)dst);
5512 cache_prefetch_next ((__m128i*)mask);
5514 m = *(uint32_t*) mask;
5515 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5517 if (m == 0xffffffff)
5519 save_128_aligned ((__m128i*)dst, xmm_src);
5523 xmm_dst = load_128_aligned ((__m128i*)dst);
5525 xmm_mask = _mm_unpacklo_epi16 (
5526 unpack_32_1x128 (m), _mm_setzero_si128 ());
5528 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5529 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5530 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5532 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
5533 &xmm_mask_lo, &xmm_mask_hi);
5535 in_over_2x128 (xmm_src_lo, xmm_src_hi,
5536 mask_00ff, mask_00ff,
5537 xmm_mask_lo, xmm_mask_hi,
5538 &xmm_dst_lo, &xmm_dst_hi);
5541 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5552 m = (uint32_t) *mask++;
5556 s = 0xff000000 | *src;
5566 *dst = pack_1x64_32 (
5570 expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
5571 unpack_32_1x64 (d)));
5587 static const pixman_fast_path_t sse2_fast_paths[] =
5589 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 },
5590 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 },
5591 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 },
5592 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 },
5593 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 },
5594 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
5595 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
5596 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
5597 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
5598 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
5599 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
5600 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5601 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5602 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5603 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5605 /* FIXME: This code is buggy in the MMX version, and the bug was carried over to this SSE2 version */
5606 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5607 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5608 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
5609 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5611 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5612 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5613 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5614 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5615 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5616 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5617 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5618 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5619 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5620 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5621 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5622 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5623 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5624 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5625 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5626 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5627 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5628 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5629 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5630 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5631 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5632 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5633 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5634 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5635 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5636 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5637 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5638 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5640 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5641 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 },
5642 { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 },
5643 { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 },
5644 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_8888_8_8, 0 },
5646 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5647 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5648 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5649 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5650 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 },
5651 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 },
5652 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5653 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5654 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5655 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5656 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 },
5657 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 },
5659 { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 },
5660 { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 },
5666 * Work around GCC bug causing crashes in Mozilla with SSE2
5668 * When using -msse, gcc generates movdqa instructions assuming that
5669 * the stack is 16 byte aligned. Unfortunately some applications, such
5670 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
5671 * causes the movdqa instructions to fail.
5673 * The __force_align_arg_pointer__ makes gcc generate a prologue that
5674 * realigns the stack pointer to 16 bytes.
5676 * On x86-64 this is not necessary because the standard ABI already
5677 * calls for a 16 byte aligned stack.
5679 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
5681 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5682 __attribute__((__force_align_arg_pointer__))
5685 sse2_composite (pixman_implementation_t *imp,
5687 pixman_image_t * src,
5688 pixman_image_t * mask,
5689 pixman_image_t * dest,
5699 if (_pixman_run_fast_path (sse2_fast_paths, imp,
5700 op, src, mask, dest,
5709 _pixman_implementation_composite (imp->delegate, op,
5717 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5718 __attribute__((__force_align_arg_pointer__))
5720 static pixman_bool_t
5721 sse2_blt (pixman_implementation_t *imp,
5722 uint32_t * src_bits,
5723 uint32_t * dst_bits,
5735 if (!pixman_blt_sse2 (
5736 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5737 src_x, src_y, dst_x, dst_y, width, height))
5740 return _pixman_implementation_blt (
5742 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5743 src_x, src_y, dst_x, dst_y, width, height);
5749 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5750 __attribute__((__force_align_arg_pointer__))
5752 static pixman_bool_t
5753 sse2_fill (pixman_implementation_t *imp,
5763 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5765 return _pixman_implementation_fill (
5766 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5772 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5773 __attribute__((__force_align_arg_pointer__))
5775 pixman_implementation_t *
5776 _pixman_implementation_create_sse2 (void)
5778 pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5779 pixman_implementation_t *imp = _pixman_implementation_create (mmx);
5781 /* SSE2 constants */
5782 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5783 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5784 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5785 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5786 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5787 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5788 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5789 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5790 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
5791 mask_0080 = create_mask_16_128 (0x0080);
5792 mask_00ff = create_mask_16_128 (0x00ff);
5793 mask_0101 = create_mask_16_128 (0x0101);
5794 mask_ffff = create_mask_16_128 (0xffff);
5795 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5796 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5799 mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5800 mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5802 mask_x0080 = create_mask_16_64 (0x0080);
5803 mask_x00ff = create_mask_16_64 (0x00ff);
5804 mask_x0101 = create_mask_16_64 (0x0101);
5805 mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5809 /* Set up function pointers */
5811 /* SSE2 code patch for fbcompose.c */
5812 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5813 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5814 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5815 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5816 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5817 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5818 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5819 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5820 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5821 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5823 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5825 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5826 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5827 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5828 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5829 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5830 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5831 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5832 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5833 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5834 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5835 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5837 imp->composite = sse2_composite;
5838 imp->blt = sse2_blt;
5839 imp->fill = sse2_fill;
5844 #endif /* USE_SSE2 */