/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
#   include "pixman-x64-mmx-emulation.h"
#endif
/* --------------------------------------------------------------------
 */

static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_x_alpha;

static __m64 mask_x565_rgb;
static __m64 mask_x565_unpack;

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

/* ----------------------------------------------------------------------
 */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t = _mm_and_si128 (rb, mask_565_fix_rb);
    t = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
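
/* In unpack_565_to_8888 above, the shifted-down copies of the red/blue
 * (>> 5) and green (>> 6) fields replicate the top bits of each 5- or
 * 6-bit channel into its low bits, so that 0x1f and 0x3f both expand to
 * a full 0xff. */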
static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}
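
/* pack_565_32_16 keeps the top 5/6/5 bits of the r/g/b channels of an
 * x8r8g8b8 pixel and packs them into a single r5g6b5 value. */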
static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
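
/* is_opaque, is_zero and is_transparent test four packed a8r8g8b8 pixels at
 * once: _mm_movemask_epi8 collects the top bit of every byte of the
 * comparison result, and the 0x8888 mask keeps only bytes 3, 7, 11 and 15,
 * i.e. the alpha byte of each pixel. */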
static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
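
/* pix_multiply_2x128 multiplies each 8-bit channel (held in a 16-bit lane)
 * by an 8-bit alpha and renormalizes: (x * a + 0x80) * 0x0101 >> 16 is the
 * usual exact-rounding substitute for x * a / 255. */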
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}
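
/* pix_add_multiply_2x128 computes src * alpha_dst + dst * alpha_src with
 * unsigned saturation; it is the shared building block of the ATOP and XOR
 * combiners below. */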
static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
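
/* over_2x128 is the Porter-Duff OVER operator for premultiplied pixels:
 * dst = src + dst * (255 - alpha) / 255, with the source alpha passed in
 * already expanded across all four channels. */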
static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
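
/* in_over_2x128 combines IN and OVER: the source and its alpha are first
 * multiplied by the (component) mask, then composited over the destination;
 * it is the workhorse of the masked fast paths. */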
static force_inline void
cache_prefetch (__m128i* addr)
{
    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
}

static force_inline void
cache_prefetch_next (__m128i* addr)
{
    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
}

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

/* ------------------------------------------------------------------
 */
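/* The __m64 helpers below mirror the __m128i ones above; they are used for
 * the single-pixel head and tail of each scanline, before and after the
 * 16-byte-aligned four-pixel inner loops. */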
static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
}

static force_inline __m64
expand_alpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
expand_pixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (
        unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
pix_multiply_1x64 (__m64 data,
                   __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          mask_x0080),
                           mask_x0101);
}

static force_inline __m64
pix_add_multiply_1x64 (__m64* src,
                       __m64* alpha_dst,
                       __m64* dst,
                       __m64* alpha_src)
{
    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);

    return _mm_adds_pu8 (t1, t2);
}

static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, mask_x00ff);
}

static force_inline __m64
invert_colors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
}

static force_inline __m64
in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pix_multiply_1x64 (*src, *mask),
                      pix_multiply_1x64 (*alpha, *mask),
                      *dst);
}

static force_inline __m64
over_rev_non_pre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expand_alpha_1x64 (src);

    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
                                         _mm_or_si64 (alpha, mask_x_alpha)),
                      alpha,
                      dst);
}

static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, mask_x565_rgb);
    p = _mm_mullo_pi16 (p, mask_x565_unpack);

    return _mm_srli_pi16 (p, 8);
}
/* ----------------------------------------------------------------------------
 * Compose Core transformations
 */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m64 ms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        ms = unpack_32_1x64 (src);
        return pack_1x64_32 (
            over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
    }

    return dst;
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m64 ms, mm;

        mm = unpack_32_1x64 (*pm);
        mm = expand_alpha_1x64 (mm);

        ms = unpack_32_1x64 (s);
        ms = pix_multiply_1x64 (ms, mm);

        s = pack_1x64_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
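
/* combine1 and combine4 fetch one or four source pixels and, when a mask
 * pointer is given, multiply them by the mask's expanded alpha; they
 * implement the "unified alpha" source fetch shared by the combiners below. */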
static force_inline void
core_combine_over_u_sse2 (uint32_t* pd,
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
        s = combine1 (ps, pm);
        *pd++ = core_combine_over_u_pixel_sse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        /* I'm loading unaligned because I'm not sure about
         * the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);

        if (is_opaque (xmm_src_hi))
            save_128_aligned ((__m128i*)pd, xmm_src_hi);
        else if (!is_zero (xmm_src_hi))
            xmm_dst_hi = load_128_aligned ((__m128i*) pd);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

            expand_alpha_2x128 (
                xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned ((__m128i*)pd,
                              pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        s = combine1 (ps, pm);
        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
static force_inline void
core_combine_over_reverse_u_sse2 (uint32_t* pd,
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
        s = combine1 (ps, pm);
        *pd++ = core_combine_over_u_pixel_sse2 (d, s);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        s = combine1 (ps, pm);
        *pd++ = core_combine_over_u_pixel_sse2 (d, s);

static force_inline uint32_t
core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    else if (maska != 0xff)
        return pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (dst),
                               expand_alpha_1x64 (unpack_32_1x64 (src))));
static force_inline void
core_combine_in_u_sse2 (uint32_t* pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
        s = combine1 (ps, pm);
        *pd++ = core_combine_in_u_pixelsse2 (d, s);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        s = combine1 (ps, pm);
        *pd++ = core_combine_in_u_pixelsse2 (d, s);

static force_inline void
core_combine_reverse_in_u_sse2 (uint32_t* pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
        s = combine1 (ps, pm);
        *pd++ = core_combine_in_u_pixelsse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        s = combine1 (ps, pm);
        *pd++ = core_combine_in_u_pixelsse2 (s, d);
static force_inline void
core_combine_reverse_out_u_sse2 (uint32_t* pd,
    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
        uint32_t s = combine1 (ps, pm);

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (s)))));

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        uint32_t s = combine1 (ps, pm);

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (s)))));

static force_inline void
core_combine_out_u_sse2 (uint32_t* pd,
    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
        uint32_t s = combine1 (ps, pm);

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (s), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (d)))));

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        uint32_t s = combine1 (ps, pm);

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (s), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (d)))));
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
    __m64 da = expand_alpha_1x64 (d);

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
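
/* ATOP: result = src * alpha(dst) + dst * (255 - alpha(src)), with each
 * product renormalized by 255. */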
static force_inline void
core_combine_atop_u_sse2 (uint32_t* pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
        s = combine1 (ps, pm);
        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        s = combine1 (ps, pm);
        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
static force_inline void
core_combine_reverse_atop_u_sse2 (uint32_t* pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
        s = combine1 (ps, pm);
        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        s = combine1 (ps, pm);
        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
}
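
/* XOR: result = src * (255 - alpha(dst)) + dst * (255 - alpha(src)), with
 * each product renormalized by 255. */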
static force_inline void
core_combine_xor_u_sse2 (uint32_t* dst,
                         const uint32_t* src,
                         const uint32_t *mask,
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
        s = combine1 (ps, pm);
        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        s = combine1 (ps, pm);
        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);

static force_inline void
core_combine_add_u_sse2 (uint32_t* dst,
                         const uint32_t* src,
                         const uint32_t* mask,
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        s = combine1 (ps, pm);

        *pd++ = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        s = combine1 (ps, pm);

        *pd++ = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x64 (
            ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}
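
/* SATURATE: if the source alpha is larger than the room left in the
 * destination (255 - alpha(dst)), the source is first scaled by
 * (255 - alpha(dst)) / alpha(src), so the subsequent add never overflows. */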
static force_inline void
core_combine_saturate_u_sse2 (uint32_t * pd,
    __m128i xmm_src, xmm_dst;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        s = combine1 (ps, pm);
        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some source alpha is greater than the respective ~dst alpha */
            s = combine1 (ps++, pm);
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);

            s = combine1 (ps++, pm);
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);

            s = combine1 (ps++, pm);
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);

            s = combine1 (ps++, pm);
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);

        save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

        s = combine1 (ps, pm);
        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
static force_inline void
core_combine_src_ca_sse2 (uint32_t* pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));

static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expand_alpha_1x64 (s);
    __m64 unpk_mask = unpack_32_1x64 (mask);
    __m64 unpk_dst = unpack_32_1x64 (dst);

    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}
static force_inline void
core_combine_over_ca_sse2 (uint32_t* pd,
    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);

static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (
        over_1x64 (d, expand_alpha_1x64 (d),
                   pix_multiply_1x64 (unpack_32_1x64 (src),
                                      unpack_32_1x64 (mask))));
}
static force_inline void
core_combine_over_reverse_ca_sse2 (uint32_t* pd,
    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);

static force_inline void
core_combine_in_ca_sse2 (uint32_t * pd,
    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                expand_alpha_1x64 (unpack_32_1x64 (d))));

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                expand_alpha_1x64 (unpack_32_1x64 (d))));
static force_inline void
core_combine_in_reverse_ca_sse2 (uint32_t * pd,
    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                pix_multiply_1x64 (unpack_32_1x64 (m),
                                   expand_alpha_1x64 (unpack_32_1x64 (s)))));

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                pix_multiply_1x64 (unpack_32_1x64 (m),
                                   expand_alpha_1x64 (unpack_32_1x64 (s)))));

static force_inline void
core_combine_out_ca_sse2 (uint32_t * pd,
    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
static force_inline void
core_combine_out_reverse_ca_sse2 (uint32_t * pd,
    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                negate_1x64 (pix_multiply_1x64 (
                    unpack_32_1x64 (m),
                    expand_alpha_1x64 (unpack_32_1x64 (s))))));

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                negate_1x64 (pix_multiply_1x64 (
                    unpack_32_1x64 (m),
                    expand_alpha_1x64 (unpack_32_1x64 (s))))));

static force_inline uint32_t
core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);
    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = expand_alpha_1x64 (d);

    s = pix_multiply_1x64 (s, m);
    m = negate_1x64 (pix_multiply_1x64 (m, sa));

    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
}
static force_inline void
core_combine_atop_ca_sse2 (uint32_t * pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);

static force_inline uint32_t
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
    __m64 sa = expand_alpha_1x64 (s);

    s = pix_multiply_1x64 (s, m);
    m = pix_multiply_1x64 (m, sa);

    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
}
static force_inline void
core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);

static force_inline uint32_t
core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                uint32_t mask,
                                uint32_t dst)
{
    __m64 a = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
                                       a, expand_alpha_1x64 (s)));
    __m64 dest = pix_multiply_1x64 (s, a);
    __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&d,
                                                &alpha_dst,
                                                &dest,
                                                &alpha_src));
}
static force_inline void
core_combine_xor_ca_sse2 (uint32_t * pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);

static force_inline void
core_combine_add_ca_sse2 (uint32_t * pd,
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (
            _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
                                             unpack_32_1x64 (m)),
                          unpack_32_1x64 (d)));

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (
                _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
                _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));

        *pd++ = pack_1x64_32 (
            _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
                                             unpack_32_1x64 (m)),
                          unpack_32_1x64 (d)));
/* ---------------------------------------------------
 * fb_compose_setup_sSE2
 */
static force_inline __m64
create_mask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static force_inline __m128i
create_mask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static force_inline __m64
create_mask_2x32_64 (uint32_t mask0,
                     uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

/* Work around a code generation bug in Sun Studio 12. */
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
# define create_mask_2x32_128(mask0, mask1)                             \
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
#endif

/* SSE2 code patch for fbcompose.c */
sse2_combine_over_u (pixman_implementation_t *imp,
                     const uint32_t *         src,
                     const uint32_t *         mask,
    core_combine_over_u_sse2 (dst, src, mask, width);

sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             const uint32_t *         src,
                             const uint32_t *         mask,
    core_combine_over_reverse_u_sse2 (dst, src, mask, width);

sse2_combine_in_u (pixman_implementation_t *imp,
                   const uint32_t *         src,
                   const uint32_t *         mask,
    core_combine_in_u_sse2 (dst, src, mask, width);

sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           const uint32_t *         src,
                           const uint32_t *         mask,
    core_combine_reverse_in_u_sse2 (dst, src, mask, width);

sse2_combine_out_u (pixman_implementation_t *imp,
                    const uint32_t *         src,
                    const uint32_t *         mask,
    core_combine_out_u_sse2 (dst, src, mask, width);

sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            const uint32_t *         src,
                            const uint32_t *         mask,
    core_combine_reverse_out_u_sse2 (dst, src, mask, width);

sse2_combine_atop_u (pixman_implementation_t *imp,
                     const uint32_t *         src,
                     const uint32_t *         mask,
    core_combine_atop_u_sse2 (dst, src, mask, width);

sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             const uint32_t *         src,
                             const uint32_t *         mask,
    core_combine_reverse_atop_u_sse2 (dst, src, mask, width);

sse2_combine_xor_u (pixman_implementation_t *imp,
                    const uint32_t *         src,
                    const uint32_t *         mask,
    core_combine_xor_u_sse2 (dst, src, mask, width);

sse2_combine_add_u (pixman_implementation_t *imp,
                    const uint32_t *         src,
                    const uint32_t *         mask,
    core_combine_add_u_sse2 (dst, src, mask, width);

sse2_combine_saturate_u (pixman_implementation_t *imp,
                         const uint32_t *         src,
                         const uint32_t *         mask,
    core_combine_saturate_u_sse2 (dst, src, mask, width);

sse2_combine_src_ca (pixman_implementation_t *imp,
                     const uint32_t *         src,
                     const uint32_t *         mask,
    core_combine_src_ca_sse2 (dst, src, mask, width);

sse2_combine_over_ca (pixman_implementation_t *imp,
                      const uint32_t *         src,
                      const uint32_t *         mask,
    core_combine_over_ca_sse2 (dst, src, mask, width);

sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              const uint32_t *         src,
                              const uint32_t *         mask,
    core_combine_over_reverse_ca_sse2 (dst, src, mask, width);

sse2_combine_in_ca (pixman_implementation_t *imp,
                    const uint32_t *         src,
                    const uint32_t *         mask,
    core_combine_in_ca_sse2 (dst, src, mask, width);

sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            const uint32_t *         src,
                            const uint32_t *         mask,
    core_combine_in_reverse_ca_sse2 (dst, src, mask, width);

sse2_combine_out_ca (pixman_implementation_t *imp,
                     const uint32_t *         src,
                     const uint32_t *         mask,
    core_combine_out_ca_sse2 (dst, src, mask, width);

sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             const uint32_t *         src,
                             const uint32_t *         mask,
    core_combine_out_reverse_ca_sse2 (dst, src, mask, width);

sse2_combine_atop_ca (pixman_implementation_t *imp,
                      const uint32_t *         src,
                      const uint32_t *         mask,
    core_combine_atop_ca_sse2 (dst, src, mask, width);

sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
                              const uint32_t *         src,
                              const uint32_t *         mask,
    core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);

sse2_combine_xor_ca (pixman_implementation_t *imp,
                     const uint32_t *         src,
                     const uint32_t *         mask,
    core_combine_xor_ca_sse2 (dst, src, mask, width);

sse2_combine_add_ca (pixman_implementation_t *imp,
                     const uint32_t *         src,
                     const uint32_t *         mask,
    core_combine_add_ca_sse2 (dst, src, mask, width);
2913 /* -------------------------------------------------------------------
2914 * composite_over_n_8888
2918 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2920 pixman_image_t * src_image,
2921 pixman_image_t * mask_image,
2922 pixman_image_t * dst_image,
2933 uint32_t *dst_line, *dst, d;
2936 __m128i xmm_src, xmm_alpha;
2937 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2939 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2944 PIXMAN_IMAGE_GET_LINE (
2945 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2947 xmm_src = expand_pixel_32_1x128 (src);
2948 xmm_alpha = expand_alpha_1x128 (xmm_src);
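/* Each scanline is handled in three stages: a scalar loop (using the __m64
 * helpers) until dst reaches a 16-byte boundary, a main loop that composites
 * four pixels at a time with aligned 128-bit loads and stores, and a scalar
 * tail for whatever is left.  Most of the composite routines below follow
 * this same pattern.
 */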
2954 /* call prefetch hint to optimize cache load*/
2955 cache_prefetch ((__m128i*)dst);
2957 dst_line += dst_stride;
2960 while (w && (unsigned long)dst & 15)
2963 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2964 _mm_movepi64_pi64 (xmm_alpha),
2965 unpack_32_1x64 (d)));
2969 cache_prefetch ((__m128i*)dst);
2973 /* fill cache line with next memory */
2974 cache_prefetch_next ((__m128i*)dst);
2976 xmm_dst = load_128_aligned ((__m128i*)dst);
2978 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2980 over_2x128 (&xmm_src, &xmm_src,
2981 &xmm_alpha, &xmm_alpha,
2982 &xmm_dst_lo, &xmm_dst_hi);
2984 /* rebuild the 4 pixel data and save */
2986 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2995 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2996 _mm_movepi64_pi64 (xmm_alpha),
2997 unpack_32_1x64 (d)));
3005 /* ---------------------------------------------------------------------
3006 * composite_over_n_0565
3009 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3011 pixman_image_t * src_image,
3012 pixman_image_t * mask_image,
3013 pixman_image_t * dst_image,
3024 uint16_t *dst_line, *dst, d;
3027 __m128i xmm_src, xmm_alpha;
3028 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3030 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3035 PIXMAN_IMAGE_GET_LINE (
3036 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3038 xmm_src = expand_pixel_32_1x128 (src);
3039 xmm_alpha = expand_alpha_1x128 (xmm_src);
3045 /* call prefetch hint to optimize cache load*/
3046 cache_prefetch ((__m128i*)dst);
3048 dst_line += dst_stride;
3051 while (w && (unsigned long)dst & 15)
3055 *dst++ = pack_565_32_16 (
3056 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3057 _mm_movepi64_pi64 (xmm_alpha),
3058 expand565_16_1x64 (d))));
3062 /* call prefetch hint to optimize cache load*/
3063 cache_prefetch ((__m128i*)dst);
3067 /* fill cache line with next memory */
3068 cache_prefetch_next ((__m128i*)dst);
3070 xmm_dst = load_128_aligned ((__m128i*)dst);
3072 unpack_565_128_4x128 (xmm_dst,
3073 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3075 over_2x128 (&xmm_src, &xmm_src,
3076 &xmm_alpha, &xmm_alpha,
3077 &xmm_dst0, &xmm_dst1);
3078 over_2x128 (&xmm_src, &xmm_src,
3079 &xmm_alpha, &xmm_alpha,
3080 &xmm_dst2, &xmm_dst3);
3082 xmm_dst = pack_565_4x128_128 (
3083 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3085 save_128_aligned ((__m128i*)dst, xmm_dst);
3094 *dst++ = pack_565_32_16 (
3095 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3096 _mm_movepi64_pi64 (xmm_alpha),
3097 expand565_16_1x64 (d))));
3104 /* ------------------------------
3105 * composite_add_n_8888_8888_ca
3108 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3110 pixman_image_t * src_image,
3111 pixman_image_t * mask_image,
3112 pixman_image_t * dst_image,
3123 uint32_t *dst_line, d;
3124 uint32_t *mask_line, m;
3126 int dst_stride, mask_stride;
3128 __m128i xmm_src, xmm_alpha;
3130 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3132 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3134 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3140 PIXMAN_IMAGE_GET_LINE (
3141 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3142 PIXMAN_IMAGE_GET_LINE (
3143 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3145 xmm_src = _mm_unpacklo_epi8 (
3146 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3147 xmm_alpha = expand_alpha_1x128 (xmm_src);
3148 mmx_src = _mm_movepi64_pi64 (xmm_src);
3149 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3154 const uint32_t *pm = (uint32_t *)mask_line;
3155 uint32_t *pd = (uint32_t *)dst_line;
3157 dst_line += dst_stride;
3158 mask_line += mask_stride;
3160 /* call prefetch hint to optimize cache load*/
3161 cache_prefetch ((__m128i*)pd);
3162 cache_prefetch ((__m128i*)pm);
3164 while (w && (unsigned long)pd & 15)
3172 mmx_mask = unpack_32_1x64 (m);
3173 mmx_dest = unpack_32_1x64 (d);
3175 *pd = pack_1x64_32 (
3176 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3183 /* call prefetch hint to optimize cache load*/
3184 cache_prefetch ((__m128i*)pd);
3185 cache_prefetch ((__m128i*)pm);
3189 /* fill cache line with next memory */
3190 cache_prefetch_next ((__m128i*)pd);
3191 cache_prefetch_next ((__m128i*)pm);
3193 xmm_mask = load_128_unaligned ((__m128i*)pm);
3197 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3199 /* if all bits of the mask are zero, pack_cmp is 0xffff and the destination can be left untouched */
3200 if (pack_cmp != 0xffff)
3202 xmm_dst = load_128_aligned ((__m128i*)pd);
3204 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3206 pix_multiply_2x128 (&xmm_src, &xmm_src,
3207 &xmm_mask_lo, &xmm_mask_hi,
3208 &xmm_mask_lo, &xmm_mask_hi);
3209 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3212 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3228 mmx_mask = unpack_32_1x64 (m);
3229 mmx_dest = unpack_32_1x64 (d);
3231 *pd = pack_1x64_32 (
3232 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3243 /* ---------------------------------------------------------------------------
3244 * composite_over_n_8888_8888_ca
3248 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3250 pixman_image_t * src_image,
3251 pixman_image_t * mask_image,
3252 pixman_image_t * dst_image,
3263 uint32_t *dst_line, d;
3264 uint32_t *mask_line, m;
3266 int dst_stride, mask_stride;
3268 __m128i xmm_src, xmm_alpha;
3269 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3270 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3272 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3274 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3279 PIXMAN_IMAGE_GET_LINE (
3280 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3281 PIXMAN_IMAGE_GET_LINE (
3282 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3284 xmm_src = _mm_unpacklo_epi8 (
3285 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3286 xmm_alpha = expand_alpha_1x128 (xmm_src);
3287 mmx_src = _mm_movepi64_pi64 (xmm_src);
3288 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3293 const uint32_t *pm = (uint32_t *)mask_line;
3294 uint32_t *pd = (uint32_t *)dst_line;
3296 dst_line += dst_stride;
3297 mask_line += mask_stride;
3299 /* call prefetch hint to optimize cache load*/
3300 cache_prefetch ((__m128i*)pd);
3301 cache_prefetch ((__m128i*)pm);
3303 while (w && (unsigned long)pd & 15)
3310 mmx_mask = unpack_32_1x64 (m);
3311 mmx_dest = unpack_32_1x64 (d);
3313 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3323 /* call prefetch hint to optimize cache load*/
3324 cache_prefetch ((__m128i*)pd);
3325 cache_prefetch ((__m128i*)pm);
3329 /* fill cache line with next memory */
3330 cache_prefetch_next ((__m128i*)pd);
3331 cache_prefetch_next ((__m128i*)pm);
3333 xmm_mask = load_128_unaligned ((__m128i*)pm);
3337 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3339 /* if all bits of the mask are zero, pack_cmp is 0xffff and the destination can be left untouched */
3340 if (pack_cmp != 0xffff)
3342 xmm_dst = load_128_aligned ((__m128i*)pd);
3344 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3345 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3347 in_over_2x128 (&xmm_src, &xmm_src,
3348 &xmm_alpha, &xmm_alpha,
3349 &xmm_mask_lo, &xmm_mask_hi,
3350 &xmm_dst_lo, &xmm_dst_hi);
3353 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3368 mmx_mask = unpack_32_1x64 (m);
3369 mmx_dest = unpack_32_1x64 (d);
3371 *pd = pack_1x64_32 (
3372 in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3383 /*---------------------------------------------------------------------
3384 * composite_over_8888_n_8888
3388 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3390 pixman_image_t * src_image,
3391 pixman_image_t * mask_image,
3392 pixman_image_t * dst_image,
3402 uint32_t *dst_line, *dst;
3403 uint32_t *src_line, *src;
3406 int dst_stride, src_stride;
3409 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3410 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3411 __m128i xmm_alpha_lo, xmm_alpha_hi;
3413 PIXMAN_IMAGE_GET_LINE (
3414 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3415 PIXMAN_IMAGE_GET_LINE (
3416 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3418 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3420 xmm_mask = create_mask_16_128 (mask >> 24);
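/* Only the alpha byte of the solid mask is used; it is replicated above into
 * every 16-bit lane so it can be applied to four pixels at once.
 */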
3425 dst_line += dst_stride;
3427 src_line += src_stride;
3430 /* call prefetch hint to optimize cache load*/
3431 cache_prefetch ((__m128i*)dst);
3432 cache_prefetch ((__m128i*)src);
3434 while (w && (unsigned long)dst & 15)
3436 uint32_t s = *src++;
3439 __m64 ms = unpack_32_1x64 (s);
3440 __m64 alpha = expand_alpha_1x64 (ms);
3441 __m64 dest = _mm_movepi64_pi64 (xmm_mask);
3442 __m64 alpha_dst = unpack_32_1x64 (d);
3444 *dst++ = pack_1x64_32 (
3445 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
3450 /* call prefetch hint to optimize cache load*/
3451 cache_prefetch ((__m128i*)dst);
3452 cache_prefetch ((__m128i*)src);
3456 /* fill cache line with next memory */
3457 cache_prefetch_next ((__m128i*)dst);
3458 cache_prefetch_next ((__m128i*)src);
3460 xmm_src = load_128_unaligned ((__m128i*)src);
3461 xmm_dst = load_128_aligned ((__m128i*)dst);
3463 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3464 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3465 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3466 &xmm_alpha_lo, &xmm_alpha_hi);
3468 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3469 &xmm_alpha_lo, &xmm_alpha_hi,
3470 &xmm_mask, &xmm_mask,
3471 &xmm_dst_lo, &xmm_dst_hi);
3474 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3483 uint32_t s = *src++;
3486 __m64 ms = unpack_32_1x64 (s);
3487 __m64 alpha = expand_alpha_1x64 (ms);
3488 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3489 __m64 dest = unpack_32_1x64 (d);
3491 *dst++ = pack_1x64_32 (
3492 in_over_1x64 (&ms, &alpha, &mask, &dest));
3501 /* ---------------------------------------------------------------------
3502 * composite_over_x888_n_8888
3505 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3507 pixman_image_t * src_image,
3508 pixman_image_t * mask_image,
3509 pixman_image_t * dst_image,
3519 uint32_t *dst_line, *dst;
3520 uint32_t *src_line, *src;
3522 int dst_stride, src_stride;
3525 __m128i xmm_mask, xmm_alpha;
3526 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3527 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3529 PIXMAN_IMAGE_GET_LINE (
3530 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3531 PIXMAN_IMAGE_GET_LINE (
3532 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3534 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3536 xmm_mask = create_mask_16_128 (mask >> 24);
3537 xmm_alpha = mask_00ff;
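/* x888 sources carry no alpha, so they are treated as fully opaque: each
 * pixel is OR-ed with 0xff000000 below, and mask_00ff serves as the constant
 * per-channel source alpha in the in_over computation.
 */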
3542 dst_line += dst_stride;
3544 src_line += src_stride;
3547 /* call prefetch hint to optimize cache load*/
3548 cache_prefetch ((__m128i*)dst);
3549 cache_prefetch ((__m128i*)src);
3551 while (w && (unsigned long)dst & 15)
3553 uint32_t s = (*src++) | 0xff000000;
3556 __m64 src = unpack_32_1x64 (s);
3557 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3558 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3559 __m64 dest = unpack_32_1x64 (d);
3561 *dst++ = pack_1x64_32 (
3562 in_over_1x64 (&src, &alpha, &mask, &dest));
3567 /* call prefetch hint to optimize cache load*/
3568 cache_prefetch ((__m128i*)dst);
3569 cache_prefetch ((__m128i*)src);
3573 /* fill cache line with next memory */
3574 cache_prefetch_next ((__m128i*)dst);
3575 cache_prefetch_next ((__m128i*)src);
3577 xmm_src = _mm_or_si128 (
3578 load_128_unaligned ((__m128i*)src), mask_ff000000);
3579 xmm_dst = load_128_aligned ((__m128i*)dst);
3581 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3582 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3584 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3585 &xmm_alpha, &xmm_alpha,
3586 &xmm_mask, &xmm_mask,
3587 &xmm_dst_lo, &xmm_dst_hi);
3590 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3600 uint32_t s = (*src++) | 0xff000000;
3603 __m64 src = unpack_32_1x64 (s);
3604 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3605 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3606 __m64 dest = unpack_32_1x64 (d);
3608 *dst++ = pack_1x64_32 (
3609 in_over_1x64 (&src, &alpha, &mask, &dest));
3618 /* --------------------------------------------------------------------
3619 * composite_over_8888_8888
3622 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3624 pixman_image_t * src_image,
3625 pixman_image_t * mask_image,
3626 pixman_image_t * dst_image,
3636 int dst_stride, src_stride;
3637 uint32_t *dst_line, *dst;
3638 uint32_t *src_line, *src;
3640 PIXMAN_IMAGE_GET_LINE (
3641 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3642 PIXMAN_IMAGE_GET_LINE (
3643 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3650 core_combine_over_u_sse2 (dst, src, NULL, width);
3658 /* ------------------------------------------------------------------
3659 * composite_over_8888_0565
3661 static force_inline uint16_t
3662 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
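/* Composite a single a8r8g8b8 source pixel OVER a single r5g6b5 destination
 * pixel: unpack both to 8-bit-per-channel form, blend, then repack to 565.
 */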
3666 ms = unpack_32_1x64 (src);
3667 return pack_565_32_16 (
3670 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3674 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3676 pixman_image_t * src_image,
3677 pixman_image_t * mask_image,
3678 pixman_image_t * dst_image,
3688 uint16_t *dst_line, *dst, d;
3689 uint32_t *src_line, *src, s;
3690 int dst_stride, src_stride;
3693 __m128i xmm_alpha_lo, xmm_alpha_hi;
3694 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3695 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3697 PIXMAN_IMAGE_GET_LINE (
3698 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3699 PIXMAN_IMAGE_GET_LINE (
3700 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3705 * This code was copied from the MMX version together with its FIXME;
3706 * if it is a problem there, it is probably a problem here as well.
3708 assert (src_image->drawable == mask_image->drawable);
3716 /* call prefetch hint to optimize cache load*/
3717 cache_prefetch ((__m128i*)src);
3718 cache_prefetch ((__m128i*)dst);
3720 dst_line += dst_stride;
3721 src_line += src_stride;
3724 /* Align dst on a 16-byte boundary */
3726 ((unsigned long)dst & 15))
3731 *dst++ = composite_over_8888_0565pixel (s, d);
3735 /* call prefetch hint to optimize cache load*/
3736 cache_prefetch ((__m128i*)src);
3737 cache_prefetch ((__m128i*)dst);
3739 /* This loop handles 8 pixels per iteration */
3742 /* fill cache line with next memory */
3743 cache_prefetch_next ((__m128i*)src);
3744 cache_prefetch_next ((__m128i*)dst);
3746 /* Load the source unaligned because its address
3747 * alignment is not guaranteed.
3749 xmm_src = load_128_unaligned ((__m128i*) src);
3750 xmm_dst = load_128_aligned ((__m128i*) dst);
3753 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3754 unpack_565_128_4x128 (xmm_dst,
3755 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3756 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3757 &xmm_alpha_lo, &xmm_alpha_hi);
3759 /* Load the next 4 source pixels from memory early so the
3760 * read overlaps with the computation below.
3762 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3764 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3765 &xmm_alpha_lo, &xmm_alpha_hi,
3766 &xmm_dst0, &xmm_dst1);
3769 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3770 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3771 &xmm_alpha_lo, &xmm_alpha_hi);
3773 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3774 &xmm_alpha_lo, &xmm_alpha_hi,
3775 &xmm_dst2, &xmm_dst3);
3778 (__m128i*)dst, pack_565_4x128_128 (
3779 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3791 *dst++ = composite_over_8888_0565pixel (s, d);
3798 /* -----------------------------------------------------------------
3799 * composite_over_n_8_8888
3803 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3805 pixman_image_t * src_image,
3806 pixman_image_t * mask_image,
3807 pixman_image_t * dst_image,
3818 uint32_t *dst_line, *dst;
3819 uint8_t *mask_line, *mask;
3820 int dst_stride, mask_stride;
3824 __m128i xmm_src, xmm_alpha, xmm_def;
3825 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3826 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3828 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3830 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3836 PIXMAN_IMAGE_GET_LINE (
3837 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3838 PIXMAN_IMAGE_GET_LINE (
3839 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3841 xmm_def = create_mask_2x32_128 (src, src);
3842 xmm_src = expand_pixel_32_1x128 (src);
3843 xmm_alpha = expand_alpha_1x128 (xmm_src);
3844 mmx_src = _mm_movepi64_pi64 (xmm_src);
3845 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3850 dst_line += dst_stride;
3852 mask_line += mask_stride;
3855 /* call prefetch hint to optimize cache load*/
3856 cache_prefetch ((__m128i*)mask);
3857 cache_prefetch ((__m128i*)dst);
3859 while (w && (unsigned long)dst & 15)
3861 uint8_t m = *mask++;
3866 mmx_mask = expand_pixel_8_1x64 (m);
3867 mmx_dest = unpack_32_1x64 (d);
3869 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3879 /* call prefetch hint to optimize cache load*/
3880 cache_prefetch ((__m128i*)mask);
3881 cache_prefetch ((__m128i*)dst);
3885 /* fill cache line with next memory */
3886 cache_prefetch_next ((__m128i*)mask);
3887 cache_prefetch_next ((__m128i*)dst);
3889 m = *((uint32_t*)mask);
3891 if (srca == 0xff && m == 0xffffffff)
3893 save_128_aligned ((__m128i*)dst, xmm_def);
3897 xmm_dst = load_128_aligned ((__m128i*) dst);
3898 xmm_mask = unpack_32_1x128 (m);
3899 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3902 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3903 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3905 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3906 &xmm_mask_lo, &xmm_mask_hi);
3908 in_over_2x128 (&xmm_src, &xmm_src,
3909 &xmm_alpha, &xmm_alpha,
3910 &xmm_mask_lo, &xmm_mask_hi,
3911 &xmm_dst_lo, &xmm_dst_hi);
3914 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3924 uint8_t m = *mask++;
3929 mmx_mask = expand_pixel_8_1x64 (m);
3930 mmx_dest = unpack_32_1x64 (d);
3932 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3946 /* ----------------------------------------------------------------
3947 * pixman_fill_sse2
3951 pixman_fill_sse2 (uint32_t *bits,
3960 uint32_t byte_width;
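/* A 16 bpp fill is only possible when the 32-bit fill value is the 16-bit
 * pattern replicated into both halves; the check below rejects anything
 * else so the caller can fall back to a generic fill.
 */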
3965 if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3968 if (bpp != 16 && bpp != 32)
3973 stride = stride * (int) sizeof (uint32_t) / 2;
3974 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3975 byte_width = 2 * width;
3980 stride = stride * (int) sizeof (uint32_t) / 4;
3981 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3982 byte_width = 4 * width;
3986 cache_prefetch ((__m128i*)byte_line);
3987 xmm_def = create_mask_2x32_128 (data, data);
3992 uint8_t *d = byte_line;
3993 byte_line += stride;
3997 cache_prefetch_next ((__m128i*)d);
3999 while (w >= 2 && ((unsigned long)d & 3))
4001 *(uint16_t *)d = data;
4006 while (w >= 4 && ((unsigned long)d & 15))
4008 *(uint32_t *)d = data;
4014 cache_prefetch_next ((__m128i*)d);
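/* The unrolled loops below store the replicated fill pattern with aligned
 * 128-bit writes, 128, 64 and 32 bytes per iteration, before finishing with
 * single 16-byte stores and scalar 32-/16-bit writes for the remainder.
 */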
4018 cache_prefetch (((__m128i*)d) + 12);
4020 save_128_aligned ((__m128i*)(d), xmm_def);
4021 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4022 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4023 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4024 save_128_aligned ((__m128i*)(d + 64), xmm_def);
4025 save_128_aligned ((__m128i*)(d + 80), xmm_def);
4026 save_128_aligned ((__m128i*)(d + 96), xmm_def);
4027 save_128_aligned ((__m128i*)(d + 112), xmm_def);
4035 cache_prefetch (((__m128i*)d) + 8);
4037 save_128_aligned ((__m128i*)(d), xmm_def);
4038 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4039 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4040 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4046 cache_prefetch_next ((__m128i*)d);
4050 save_128_aligned ((__m128i*)(d), xmm_def);
4051 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4059 save_128_aligned ((__m128i*)(d), xmm_def);
4065 cache_prefetch_next ((__m128i*)d);
4069 *(uint32_t *)d = data;
4077 *(uint16_t *)d = data;
4088 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4090 pixman_image_t * src_image,
4091 pixman_image_t * mask_image,
4092 pixman_image_t * dst_image,
4103 uint32_t *dst_line, *dst;
4104 uint8_t *mask_line, *mask;
4105 int dst_stride, mask_stride;
4109 __m128i xmm_src, xmm_def;
4110 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4112 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4117 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4118 PIXMAN_FORMAT_BPP (dst_image->bits.format),
4119 dest_x, dest_y, width, height, 0);
4123 PIXMAN_IMAGE_GET_LINE (
4124 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4125 PIXMAN_IMAGE_GET_LINE (
4126 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4128 xmm_def = create_mask_2x32_128 (src, src);
4129 xmm_src = expand_pixel_32_1x128 (src);
4134 dst_line += dst_stride;
4136 mask_line += mask_stride;
4139 /* call prefetch hint to optimize cache load*/
4140 cache_prefetch ((__m128i*)mask);
4141 cache_prefetch ((__m128i*)dst);
4143 while (w && (unsigned long)dst & 15)
4145 uint8_t m = *mask++;
4149 *dst = pack_1x64_32 (
4151 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4162 /* call prefetch hint to optimize cache load*/
4163 cache_prefetch ((__m128i*)mask);
4164 cache_prefetch ((__m128i*)dst);
4168 /* fill cache line with next memory */
4169 cache_prefetch_next ((__m128i*)mask);
4170 cache_prefetch_next ((__m128i*)dst);
4172 m = *((uint32_t*)mask);
4174 if (srca == 0xff && m == 0xffffffff)
4176 save_128_aligned ((__m128i*)dst, xmm_def);
4180 xmm_mask = unpack_32_1x128 (m);
4181 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4184 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4186 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4187 &xmm_mask_lo, &xmm_mask_hi);
4189 pix_multiply_2x128 (&xmm_src, &xmm_src,
4190 &xmm_mask_lo, &xmm_mask_hi,
4191 &xmm_mask_lo, &xmm_mask_hi);
4194 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4198 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4208 uint8_t m = *mask++;
4212 *dst = pack_1x64_32 (
4214 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4229 /*-----------------------------------------------------------------------
4230 * composite_over_n_8_0565
4234 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4236 pixman_image_t * src_image,
4237 pixman_image_t * mask_image,
4238 pixman_image_t * dst_image,
4249 uint16_t *dst_line, *dst, d;
4250 uint8_t *mask_line, *mask;
4251 int dst_stride, mask_stride;
4254 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4256 __m128i xmm_src, xmm_alpha;
4257 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4258 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4260 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4266 PIXMAN_IMAGE_GET_LINE (
4267 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4268 PIXMAN_IMAGE_GET_LINE (
4269 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4271 xmm_src = expand_pixel_32_1x128 (src);
4272 xmm_alpha = expand_alpha_1x128 (xmm_src);
4273 mmx_src = _mm_movepi64_pi64 (xmm_src);
4274 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4279 dst_line += dst_stride;
4281 mask_line += mask_stride;
4284 /* call prefetch hint to optimize cache load*/
4285 cache_prefetch ((__m128i*)mask);
4286 cache_prefetch ((__m128i*)dst);
4288 while (w && (unsigned long)dst & 15)
4295 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4296 mmx_dest = expand565_16_1x64 (d);
4298 *dst = pack_565_32_16 (
4301 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4308 /* call prefetch hint to optimize cache load*/
4309 cache_prefetch ((__m128i*)mask);
4310 cache_prefetch ((__m128i*)dst);
4314 /* fill cache line with next memory */
4315 cache_prefetch_next ((__m128i*)mask);
4316 cache_prefetch_next ((__m128i*)dst);
4318 xmm_dst = load_128_aligned ((__m128i*) dst);
4319 unpack_565_128_4x128 (xmm_dst,
4320 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4322 m = *((uint32_t*)mask);
4327 xmm_mask = unpack_32_1x128 (m);
4328 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4331 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4333 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4334 &xmm_mask_lo, &xmm_mask_hi);
4336 in_over_2x128 (&xmm_src, &xmm_src,
4337 &xmm_alpha, &xmm_alpha,
4338 &xmm_mask_lo, &xmm_mask_hi,
4339 &xmm_dst0, &xmm_dst1);
4342 m = *((uint32_t*)mask);
4347 xmm_mask = unpack_32_1x128 (m);
4348 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4351 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4353 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4354 &xmm_mask_lo, &xmm_mask_hi);
4355 in_over_2x128 (&xmm_src, &xmm_src,
4356 &xmm_alpha, &xmm_alpha,
4357 &xmm_mask_lo, &xmm_mask_hi,
4358 &xmm_dst2, &xmm_dst3);
4362 (__m128i*)dst, pack_565_4x128_128 (
4363 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4376 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4377 mmx_dest = expand565_16_1x64 (d);
4379 *dst = pack_565_32_16 (
4382 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4393 /* -----------------------------------------------------------------------
4394 * composite_over_pixbuf_0565
4398 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4400 pixman_image_t * src_image,
4401 pixman_image_t * mask_image,
4402 pixman_image_t * dst_image,
4412 uint16_t *dst_line, *dst, d;
4413 uint32_t *src_line, *src, s;
4414 int dst_stride, src_stride;
4416 uint32_t opaque, zero;
4419 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4420 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4422 PIXMAN_IMAGE_GET_LINE (
4423 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4424 PIXMAN_IMAGE_GET_LINE (
4425 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4430 * This code was copied from the MMX version together with its FIXME;
4431 * if it is a problem there, it is probably a problem here as well.
4433 assert (src_image->drawable == mask_image->drawable);
4439 dst_line += dst_stride;
4441 src_line += src_stride;
4444 /* call prefetch hint to optimize cache load*/
4445 cache_prefetch ((__m128i*)src);
4446 cache_prefetch ((__m128i*)dst);
4448 while (w && (unsigned long)dst & 15)
4453 ms = unpack_32_1x64 (s);
4455 *dst++ = pack_565_32_16 (
4457 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4461 /* call prefetch hint to optimize cache load*/
4462 cache_prefetch ((__m128i*)src);
4463 cache_prefetch ((__m128i*)dst);
4467 /* fill cache line with next memory */
4468 cache_prefetch_next ((__m128i*)src);
4469 cache_prefetch_next ((__m128i*)dst);
4472 xmm_src = load_128_unaligned ((__m128i*)src);
4473 xmm_dst = load_128_aligned ((__m128i*)dst);
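/* Check whether all four source pixels are fully opaque or fully
 * transparent, so the branches below can skip the full blend in those
 * common cases.
 */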
4475 opaque = is_opaque (xmm_src);
4476 zero = is_zero (xmm_src);
4478 unpack_565_128_4x128 (xmm_dst,
4479 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4480 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4482 /* preload next round*/
4483 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4487 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4488 &xmm_dst0, &xmm_dst1);
4492 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4493 &xmm_dst0, &xmm_dst1);
4497 opaque = is_opaque (xmm_src);
4498 zero = is_zero (xmm_src);
4500 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4504 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4505 &xmm_dst2, &xmm_dst3);
4509 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4510 &xmm_dst2, &xmm_dst3);
4514 (__m128i*)dst, pack_565_4x128_128 (
4515 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4527 ms = unpack_32_1x64 (s);
4529 *dst++ = pack_565_32_16 (
4531 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4539 /* -------------------------------------------------------------------------
4540 * composite_over_pixbuf_8888
4544 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4546 pixman_image_t * src_image,
4547 pixman_image_t * mask_image,
4548 pixman_image_t * dst_image,
4558 uint32_t *dst_line, *dst, d;
4559 uint32_t *src_line, *src, s;
4560 int dst_stride, src_stride;
4562 uint32_t opaque, zero;
4564 __m128i xmm_src_lo, xmm_src_hi;
4565 __m128i xmm_dst_lo, xmm_dst_hi;
4567 PIXMAN_IMAGE_GET_LINE (
4568 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4569 PIXMAN_IMAGE_GET_LINE (
4570 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4575 * This code was copied from the MMX version together with its FIXME;
4576 * if it is a problem there, it is probably a problem here as well.
4578 assert (src_image->drawable == mask_image->drawable);
4584 dst_line += dst_stride;
4586 src_line += src_stride;
4589 /* call prefetch hint to optimize cache load*/
4590 cache_prefetch ((__m128i*)src);
4591 cache_prefetch ((__m128i*)dst);
4593 while (w && (unsigned long)dst & 15)
4598 *dst++ = pack_1x64_32 (
4599 over_rev_non_pre_1x64 (
4600 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4605 /* call prefetch hint to optimize cache load*/
4606 cache_prefetch ((__m128i*)src);
4607 cache_prefetch ((__m128i*)dst);
4611 /* fill cache line with next memory */
4612 cache_prefetch_next ((__m128i*)src);
4613 cache_prefetch_next ((__m128i*)dst);
4615 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4617 opaque = is_opaque (xmm_src_hi);
4618 zero = is_zero (xmm_src_hi);
4620 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4624 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4625 &xmm_dst_lo, &xmm_dst_hi);
4628 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4632 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4634 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4636 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4637 &xmm_dst_lo, &xmm_dst_hi);
4640 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4653 *dst++ = pack_1x64_32 (
4654 over_rev_non_pre_1x64 (
4655 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4664 /* -------------------------------------------------------------------------------------------------
4665 * composite_over_n_8888_0565_ca
4669 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4671 pixman_image_t * src_image,
4672 pixman_image_t * mask_image,
4673 pixman_image_t * dst_image,
4684 uint16_t *dst_line, *dst, d;
4685 uint32_t *mask_line, *mask, m;
4686 int dst_stride, mask_stride;
4690 __m128i xmm_src, xmm_alpha;
4691 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4692 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4694 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4696 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4701 PIXMAN_IMAGE_GET_LINE (
4702 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4703 PIXMAN_IMAGE_GET_LINE (
4704 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4706 xmm_src = expand_pixel_32_1x128 (src);
4707 xmm_alpha = expand_alpha_1x128 (xmm_src);
4708 mmx_src = _mm_movepi64_pi64 (xmm_src);
4709 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4716 mask_line += mask_stride;
4717 dst_line += dst_stride;
4719 /* call prefetch hint to optimize cache load*/
4720 cache_prefetch ((__m128i*)mask);
4721 cache_prefetch ((__m128i*)dst);
4723 while (w && ((unsigned long)dst & 15))
4725 m = *(uint32_t *) mask;
4730 mmx_mask = unpack_32_1x64 (m);
4731 mmx_dest = expand565_16_1x64 (d);
4733 *dst = pack_565_32_16 (
4736 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4744 /* call prefetch hint to optimize cache load*/
4745 cache_prefetch ((__m128i*)mask);
4746 cache_prefetch ((__m128i*)dst);
4750 /* fill cache line with next memory */
4751 cache_prefetch_next ((__m128i*)mask);
4752 cache_prefetch_next ((__m128i*)dst);
4755 xmm_mask = load_128_unaligned ((__m128i*)mask);
4756 xmm_dst = load_128_aligned ((__m128i*)dst);
4758 pack_cmp = _mm_movemask_epi8 (
4759 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4761 unpack_565_128_4x128 (xmm_dst,
4762 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4763 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4765 /* preload next round */
4766 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4768 /* blend the first four pixels only if their mask values are not all zero */
4769 if (pack_cmp != 0xffff)
4771 in_over_2x128 (&xmm_src, &xmm_src,
4772 &xmm_alpha, &xmm_alpha,
4773 &xmm_mask_lo, &xmm_mask_hi,
4774 &xmm_dst0, &xmm_dst1);
4778 pack_cmp = _mm_movemask_epi8 (
4779 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4781 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4783 if (pack_cmp != 0xffff)
4785 in_over_2x128 (&xmm_src, &xmm_src,
4786 &xmm_alpha, &xmm_alpha,
4787 &xmm_mask_lo, &xmm_mask_hi,
4788 &xmm_dst2, &xmm_dst3);
4792 (__m128i*)dst, pack_565_4x128_128 (
4793 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4802 m = *(uint32_t *) mask;
4807 mmx_mask = unpack_32_1x64 (m);
4808 mmx_dest = expand565_16_1x64 (d);
4810 *dst = pack_565_32_16 (
4813 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4825 /* -----------------------------------------------------------------------
4826 * composite_in_n_8_8
4830 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4832 pixman_image_t * src_image,
4833 pixman_image_t * mask_image,
4834 pixman_image_t * dst_image,
4844 uint8_t *dst_line, *dst;
4845 uint8_t *mask_line, *mask;
4846 int dst_stride, mask_stride;
4852 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4853 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4855 PIXMAN_IMAGE_GET_LINE (
4856 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4857 PIXMAN_IMAGE_GET_LINE (
4858 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4860 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4864 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
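/* With an a8 destination, the IN operator with a solid source and an a8
 * mask reduces to src.alpha * mask * dest, so only the alpha channel of
 * the solid source (expanded above) is needed.
 */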
4869 dst_line += dst_stride;
4871 mask_line += mask_stride;
4874 /* call prefetch hint to optimize cache load*/
4875 cache_prefetch ((__m128i*)mask);
4876 cache_prefetch ((__m128i*)dst);
4878 while (w && ((unsigned long)dst & 15))
4880 m = (uint32_t) *mask++;
4881 d = (uint32_t) *dst;
4883 *dst++ = (uint8_t) pack_1x64_32 (
4885 pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4886 unpack_32_1x64 (m)),
4887 unpack_32_1x64 (d)));
4891 /* call prefetch hint to optimize cache load*/
4892 cache_prefetch ((__m128i*)mask);
4893 cache_prefetch ((__m128i*)dst);
4897 /* fill cache line with next memory */
4898 cache_prefetch_next ((__m128i*)mask);
4899 cache_prefetch_next ((__m128i*)dst);
4901 xmm_mask = load_128_unaligned ((__m128i*)mask);
4902 xmm_dst = load_128_aligned ((__m128i*)dst);
4904 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4905 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4907 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4908 &xmm_mask_lo, &xmm_mask_hi,
4909 &xmm_mask_lo, &xmm_mask_hi);
4911 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4912 &xmm_dst_lo, &xmm_dst_hi,
4913 &xmm_dst_lo, &xmm_dst_hi);
4916 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4925 m = (uint32_t) *mask++;
4926 d = (uint32_t) *dst;
4928 *dst++ = (uint8_t) pack_1x64_32 (
4931 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4932 unpack_32_1x64 (d)));
4940 /* ---------------------------------------------------------------------------
4945 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4947 pixman_image_t * src_image,
4948 pixman_image_t * mask_image,
4949 pixman_image_t * dst_image,
4959 uint8_t *dst_line, *dst;
4960 uint8_t *src_line, *src;
4961 int src_stride, dst_stride;
4965 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4966 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4968 PIXMAN_IMAGE_GET_LINE (
4969 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4970 PIXMAN_IMAGE_GET_LINE (
4971 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4976 dst_line += dst_stride;
4978 src_line += src_stride;
4981 /* call prefetch hint to optimize cache load*/
4982 cache_prefetch ((__m128i*)src);
4983 cache_prefetch ((__m128i*)dst);
4985 while (w && ((unsigned long)dst & 15))
4987 s = (uint32_t) *src++;
4988 d = (uint32_t) *dst;
4990 *dst++ = (uint8_t) pack_1x64_32 (
4992 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4996 /* call prefetch hint to optimize cache load*/
4997 cache_prefetch ((__m128i*)src);
4998 cache_prefetch ((__m128i*)dst);
5002 /* fill cache line with next memory */
5003 cache_prefetch_next ((__m128i*)src);
5004 cache_prefetch_next ((__m128i*)dst);
5006 xmm_src = load_128_unaligned ((__m128i*)src);
5007 xmm_dst = load_128_aligned ((__m128i*)dst);
5009 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5010 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5012 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5013 &xmm_dst_lo, &xmm_dst_hi,
5014 &xmm_dst_lo, &xmm_dst_hi);
5017 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5026 s = (uint32_t) *src++;
5027 d = (uint32_t) *dst;
5029 *dst++ = (uint8_t) pack_1x64_32 (
5030 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5038 /* -------------------------------------------------------------------------
5039 * composite_add_n_8_8
5043 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5045 pixman_image_t * src_image,
5046 pixman_image_t * mask_image,
5047 pixman_image_t * dst_image,
5057 uint8_t *dst_line, *dst;
5058 uint8_t *mask_line, *mask;
5059 int dst_stride, mask_stride;
5066 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5067 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5069 PIXMAN_IMAGE_GET_LINE (
5070 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5071 PIXMAN_IMAGE_GET_LINE (
5072 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5074 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5078 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5083 dst_line += dst_stride;
5085 mask_line += mask_stride;
5088 /* call prefetch hint to optimize cache load*/
5089 cache_prefetch ((__m128i*)mask);
5090 cache_prefetch ((__m128i*)dst);
5092 while (w && ((unsigned long)dst & 15))
5094 m = (uint32_t) *mask++;
5095 d = (uint32_t) *dst;
5097 *dst++ = (uint8_t) pack_1x64_32 (
5100 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5101 unpack_32_1x64 (d)));
5105 /* call prefetch hint to optimize cache load*/
5106 cache_prefetch ((__m128i*)mask);
5107 cache_prefetch ((__m128i*)dst);
5111 /* fill cache line with next memory */
5112 cache_prefetch_next ((__m128i*)mask);
5113 cache_prefetch_next ((__m128i*)dst);
5115 xmm_mask = load_128_unaligned ((__m128i*)mask);
5116 xmm_dst = load_128_aligned ((__m128i*)dst);
5118 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5119 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5121 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5122 &xmm_mask_lo, &xmm_mask_hi,
5123 &xmm_mask_lo, &xmm_mask_hi);
5125 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5126 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5129 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5138 m = (uint32_t) *mask++;
5139 d = (uint32_t) *dst;
5141 *dst++ = (uint8_t) pack_1x64_32 (
5144 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5145 unpack_32_1x64 (d)));
5154 /* ----------------------------------------------------------------------
5155 * composite_add_8000_8000
5159 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5161 pixman_image_t * src_image,
5162 pixman_image_t * mask_image,
5163 pixman_image_t * dst_image,
5173 uint8_t *dst_line, *dst;
5174 uint8_t *src_line, *src;
5175 int dst_stride, src_stride;
5179 PIXMAN_IMAGE_GET_LINE (
5180 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5181 PIXMAN_IMAGE_GET_LINE (
5182 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5189 /* call prefetch hint to optimize cache load*/
5190 cache_prefetch ((__m128i*)src);
5191 cache_prefetch ((__m128i*)dst);
5193 dst_line += dst_stride;
5194 src_line += src_stride;
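/* The scalar head and tail loops below use a branch-free saturating add:
 * the sum t is at most 0x1fe, so (t >> 8) is 1 exactly when it overflowed
 * a byte, and OR-ing with (0 - (t >> 8)) then clamps the stored byte
 * to 0xff.
 */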
5198 while (w && (unsigned long)dst & 3)
5200 t = (*dst) + (*src++);
5201 *dst++ = t | (0 - (t >> 8));
5205 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5215 t = (*dst) + (*src++);
5216 *dst++ = t | (0 - (t >> 8));
5224 /* ---------------------------------------------------------------------
5225 * composite_add_8888_8888
5228 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5230 pixman_image_t * src_image,
5231 pixman_image_t * mask_image,
5232 pixman_image_t * dst_image,
5242 uint32_t *dst_line, *dst;
5243 uint32_t *src_line, *src;
5244 int dst_stride, src_stride;
5246 PIXMAN_IMAGE_GET_LINE (
5247 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5248 PIXMAN_IMAGE_GET_LINE (
5249 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5254 dst_line += dst_stride;
5256 src_line += src_stride;
5258 core_combine_add_u_sse2 (dst, src, NULL, width);
5264 /* -------------------------------------------------------------------------------------------------
5265 * sse2_composite_copy_area
5268 static pixman_bool_t
5269 pixman_blt_sse2 (uint32_t *src_bits,
5282 uint8_t * src_bytes;
5283 uint8_t * dst_bytes;
5286 if (src_bpp != dst_bpp)
5291 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5292 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5293 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5294 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5295 byte_width = 2 * width;
5299 else if (src_bpp == 32)
5301 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5302 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5303 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5304 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5305 byte_width = 4 * width;
5314 cache_prefetch ((__m128i*)src_bytes);
5315 cache_prefetch ((__m128i*)dst_bytes);
5320 uint8_t *s = src_bytes;
5321 uint8_t *d = dst_bytes;
5322 src_bytes += src_stride;
5323 dst_bytes += dst_stride;
5326 cache_prefetch_next ((__m128i*)s);
5327 cache_prefetch_next ((__m128i*)d);
5329 while (w >= 2 && ((unsigned long)d & 3))
5331 *(uint16_t *)d = *(uint16_t *)s;
5337 while (w >= 4 && ((unsigned long)d & 15))
5339 *(uint32_t *)d = *(uint32_t *)s;
5346 cache_prefetch_next ((__m128i*)s);
5347 cache_prefetch_next ((__m128i*)d);
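/* Copy 64 bytes per iteration: four unaligned 128-bit loads from the source
 * paired with four aligned 128-bit stores to the now 16-byte-aligned
 * destination.
 */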
5351 __m128i xmm0, xmm1, xmm2, xmm3;
5353 /* 128 bytes ahead */
5354 cache_prefetch (((__m128i*)s) + 8);
5355 cache_prefetch (((__m128i*)d) + 8);
5357 xmm0 = load_128_unaligned ((__m128i*)(s));
5358 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5359 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5360 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5362 save_128_aligned ((__m128i*)(d), xmm0);
5363 save_128_aligned ((__m128i*)(d + 16), xmm1);
5364 save_128_aligned ((__m128i*)(d + 32), xmm2);
5365 save_128_aligned ((__m128i*)(d + 48), xmm3);
5372 cache_prefetch_next ((__m128i*)s);
5373 cache_prefetch_next ((__m128i*)d);
5377 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5384 cache_prefetch_next ((__m128i*)s);
5385 cache_prefetch_next ((__m128i*)d);
5389 *(uint32_t *)d = *(uint32_t *)s;
5398 *(uint16_t *)d = *(uint16_t *)s;
5411 sse2_composite_copy_area (pixman_implementation_t *imp,
5413 pixman_image_t * src_image,
5414 pixman_image_t * mask_image,
5415 pixman_image_t * dst_image,
5425 pixman_blt_sse2 (src_image->bits.bits,
5426 dst_image->bits.bits,
5427 src_image->bits.rowstride,
5428 dst_image->bits.rowstride,
5429 PIXMAN_FORMAT_BPP (src_image->bits.format),
5430 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5431 src_x, src_y, dest_x, dest_y, width, height);
5435 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5437 pixman_image_t * src_image,
5438 pixman_image_t * mask_image,
5439 pixman_image_t * dst_image,
5449 uint32_t *src, *src_line, s;
5450 uint32_t *dst, *dst_line, d;
5451 uint8_t *mask, *mask_line;
5453 int src_stride, mask_stride, dst_stride;
5456 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5457 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5458 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5460 PIXMAN_IMAGE_GET_LINE (
5461 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5462 PIXMAN_IMAGE_GET_LINE (
5463 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5464 PIXMAN_IMAGE_GET_LINE (
5465 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5470 src_line += src_stride;
5472 dst_line += dst_stride;
5474 mask_line += mask_stride;
5478 /* call prefetch hint to optimize cache load*/
5479 cache_prefetch ((__m128i*)src);
5480 cache_prefetch ((__m128i*)dst);
5481 cache_prefetch ((__m128i*)mask);
5483 while (w && (unsigned long)dst & 15)
5485 s = 0xff000000 | *src++;
5486 m = (uint32_t) *mask++;
5489 __m64 ms = unpack_32_1x64 (s);
5493 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5494 __m64 md = unpack_32_1x64 (d);
5496 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5499 *dst++ = pack_1x64_32 (ms);
5503 /* call prefetch hint to optimize cache load*/
5504 cache_prefetch ((__m128i*)src);
5505 cache_prefetch ((__m128i*)dst);
5506 cache_prefetch ((__m128i*)mask);
5510 /* fill cache line with next memory */
5511 cache_prefetch_next ((__m128i*)src);
5512 cache_prefetch_next ((__m128i*)dst);
5513 cache_prefetch_next ((__m128i*)mask);
5515 m = *(uint32_t*) mask;
5516 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5518 if (m == 0xffffffff)
5520 save_128_aligned ((__m128i*)dst, xmm_src);
5524 xmm_dst = load_128_aligned ((__m128i*)dst);
5526 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5528 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5529 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5530 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5532 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5534 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5536 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5547 m = (uint32_t) *mask++;
5551 s = 0xff000000 | *src;
5563 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5564 md = unpack_32_1x64 (d);
5565 ms = unpack_32_1x64 (s);
5567 *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5581 static const pixman_fast_path_t sse2_fast_paths[] =
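/* Each entry is: operator, source format, mask format, destination format,
 * fast path function, flags.
 */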
5583 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 },
5584 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 },
5585 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 },
5586 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 },
5587 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 },
5588 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
5589 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
5590 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
5591 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
5592 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
5593 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
5594 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5595 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5596 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5597 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5598 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5599 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5600 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
5601 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
5602 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5603 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5604 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5605 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5606 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5607 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5608 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5609 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5610 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5611 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5612 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5613 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5614 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5615 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5616 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5617 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5618 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5619 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5620 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5621 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5622 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5623 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5624 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5625 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5626 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5627 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5628 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5629 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5631 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5632 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 },
5633 { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 },
5634 { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 },
5635 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_n_8_8, 0 },
5637 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5638 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5639 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5640 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5641 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 },
5642 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 },
5643 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5644 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5645 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5646 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5647 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 },
5648 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 },
5650 { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 },
5651 { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 },
5657 * Work around GCC bug causing crashes in Mozilla with SSE2
5659 * When using -msse, gcc generates movdqa instructions assuming that
5660 * the stack is 16 byte aligned. Unfortunately some applications, such
5661 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
5662 * causes the movdqa instructions to fail.
5664 * The __force_align_arg_pointer__ makes gcc generate a prologue that
5665 * realigns the stack pointer to 16 bytes.
5667 * On x86-64 this is not necessary because the standard ABI already
5668 * calls for a 16 byte aligned stack.
5670 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
5672 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5673 __attribute__((__force_align_arg_pointer__))
5676 sse2_composite (pixman_implementation_t *imp,
5678 pixman_image_t * src,
5679 pixman_image_t * mask,
5680 pixman_image_t * dest,
5690 if (_pixman_run_fast_path (sse2_fast_paths, imp,
5691 op, src, mask, dest,
5700 _pixman_implementation_composite (imp->delegate, op,
5708 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5709 __attribute__((__force_align_arg_pointer__))
5711 static pixman_bool_t
5712 sse2_blt (pixman_implementation_t *imp,
5713 uint32_t * src_bits,
5714 uint32_t * dst_bits,
5726 if (!pixman_blt_sse2 (
5727 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5728 src_x, src_y, dst_x, dst_y, width, height))
5731 return _pixman_implementation_blt (
5733 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5734 src_x, src_y, dst_x, dst_y, width, height);
5740 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5741 __attribute__((__force_align_arg_pointer__))
5743 static pixman_bool_t
5744 sse2_fill (pixman_implementation_t *imp,
5754 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5756 return _pixman_implementation_fill (
5757 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5763 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5764 __attribute__((__force_align_arg_pointer__))
5766 pixman_implementation_t *
5767 _pixman_implementation_create_sse2 (void)
5769 pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5770 pixman_implementation_t *imp = _pixman_implementation_create (mmx);
5772 /* SSE2 constants */
5773 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5774 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5775 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5776 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5777 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5778 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5779 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5780 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5781 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
5782 mask_0080 = create_mask_16_128 (0x0080);
5783 mask_00ff = create_mask_16_128 (0x00ff);
5784 mask_0101 = create_mask_16_128 (0x0101);
5785 mask_ffff = create_mask_16_128 (0xffff);
5786 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5787 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5790 mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5791 mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5793 mask_x0080 = create_mask_16_64 (0x0080);
5794 mask_x00ff = create_mask_16_64 (0x00ff);
5795 mask_x0101 = create_mask_16_64 (0x0101);
5796 mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5800 /* Set up function pointers */
5802 /* SSE2-optimized versions of the combine functions from fbcompose.c */
5803 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5804 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5805 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5806 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5807 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5808 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5809 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5810 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5811 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5812 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5814 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5816 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5817 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5818 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5819 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5820 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5821 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5822 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5823 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5824 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5825 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5826 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5828 imp->composite = sse2_composite;
5829 imp->blt = sse2_blt;
5830 imp->fill = sse2_fill;
5835 #endif /* USE_SSE2 */