/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <mmintrin.h>  /* for the __m64 (MMX) intrinsics used below */
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
# include "pixman-x64-mmx-emulation.h"
#endif

/* --------------------------------------------------------------------
 * Locals
 */

static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_x_alpha;

static __m64 mask_x565_rgb;
static __m64 mask_x565_unpack;

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

/* ----------------------------------------------------------------------
 * SSE2 Inlines
 */

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

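/* Worked example (added, not in the original source): unpacking
 * zero-extends each 8-bit channel of an a8r8g8b8 pixel into a 16-bit
 * lane.  For data = 0x80ff40c0, the four low 16-bit lanes, lowest
 * first, become 0x00c0, 0x0040, 0x00ff, 0x0080 (blue, green, red,
 * alpha); the high half of the register is zero.
 */
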
static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

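/* Worked example (added): for the 565 pixel 0xffff the shifts and
 * masks give r = 0xf80000, g = 0x00fc00, b = 0x0000f8.  OR-ing the
 * top bits back in shifted right (>> 5 for red/blue, >> 6 for green)
 * replicates them into the low bits, yielding 0xffffff rather than
 * 0xf8fcf8, so full intensity stays full intensity.
 */
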
static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

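/* Worked example (added): packing 0x00ff8040 keeps the top 5 bits of
 * red (0xff >> 3 = 0x1f), the top 6 of green (0x80 >> 2 = 0x20) and
 * the top 5 of blue (0x40 >> 3 = 0x08):
 * (0x1f << 11) | (0x20 << 5) | 0x08 = 0xfc08.
 */
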
static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

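/* Note (added): _mm_movemask_epi8 gathers the top bit of each of the
 * 16 bytes.  The alpha byte of each a8r8g8b8 pixel is byte 3, 7, 11
 * and 15, i.e. mask bits 3, 7, 11 and 15 -- hence 0x8888.  is_opaque
 * tests those four bytes against 0xff, is_transparent against 0x00,
 * while is_zero requires all 16 mask bits (the whole vector) to match
 * zero.
 */
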
static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}

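/* Worked example (added): the three steps above compute an exact
 * rounded x*a/255 per 16-bit lane, as t = x*a + 0x80 followed by a
 * multiply-high with 0x0101, i.e. ((t + (t >> 8)) >> 8).  For
 * x = 0xff, a = 0x80: t = 0x7f80 + 0x80 = 0x8000, and
 * (0x8000 * 0x0101) >> 16 = 0x80, which is round(255 * 128 / 255).
 */
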
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

static force_inline void
cache_prefetch (__m128i* addr)
{
    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
}

static force_inline void
cache_prefetch_next (__m128i* addr)
{
    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
}

/* prefetching NULL is very slow on some systems. don't do that. */

static force_inline void
maybe_prefetch (__m128i* addr)
{
    if (addr)
        cache_prefetch (addr);
}

static force_inline void
maybe_prefetch_next (__m128i* addr)
{
    if (addr)
        cache_prefetch_next (addr);
}

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

/* ------------------------------------------------------------------
 * MMX inlines
 */

static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
}

static force_inline __m64
expand_alpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
expand_pixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (
        unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
pix_multiply_1x64 (__m64 data,
                   __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          mask_x0080),
                           mask_x0101);
}

static force_inline __m64
pix_add_multiply_1x64 (__m64* src,
                       __m64* alpha_dst,
                       __m64* dst,
                       __m64* alpha_src)
{
    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);

    return _mm_adds_pu8 (t1, t2);
}

static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, mask_x00ff);
}

static force_inline __m64
invert_colors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
}

static force_inline __m64
in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pix_multiply_1x64 (*src, *mask),
                      pix_multiply_1x64 (*alpha, *mask),
                      *dst);
}

static force_inline __m64
over_rev_non_pre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expand_alpha_1x64 (src);

    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
                                         _mm_or_si64 (alpha, mask_x_alpha)),
                      alpha,
                      dst);
}

static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */

static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, mask_x565_rgb);
    p = _mm_mullo_pi16 (p, mask_x565_unpack);

    return _mm_srli_pi16 (p, 8);
}

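/* Worked example (added): after the mask, each 16-bit word holds one
 * raw field, and the per-word multiply replicates it.  A 5-bit red r
 * times 0x0840 = (1 << 11) | (1 << 6) gives (r << 11) | (r << 6), and
 * >> 8 yields (r << 3) | (r >> 2), the usual 565-to-888 expansion;
 * green uses 0x0410 for the 6-bit analogue.
 */
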
/* ----------------------------------------------------------------------------
 * Compose Core transformations
 */

static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m64 ms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        ms = unpack_32_1x64 (src);
        return pack_1x64_32 (
            over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
    }

    return dst;
}

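/* For reference (added): this is Porter-Duff OVER,
 * dest = src + (1 - alpha(src)) * dest, with the alpha factor applied
 * per channel by over_1x64 via negate_1x64/pix_multiply_1x64; the
 * opaque and fully transparent cases are short-circuited.
 */
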
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m64 ms, mm;

        mm = unpack_32_1x64 (*pm);
        mm = expand_alpha_1x64 (mm);

        ms = unpack_32_1x64 (s);
        ms = pix_multiply_1x64 (ms, mm);

        s = pack_1x64_32 (ms);
    }

    return s;
}

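/* Usage note (added): combine1/combine4 fold the optional mask into
 * the source, so combine1 (ps, NULL) is simply *ps, while
 * combine1 (ps, pm) returns IN (s, alpha(m)) -- one pixel or four at
 * a time with identical semantics.
 */
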
static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

static force_inline void
core_combine_over_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        /* I'm loading unaligned because I'm not sure about
         * the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);

        if (is_opaque (xmm_src_hi))
        {
            save_128_aligned ((__m128i*)pd, xmm_src_hi);
        }
        else if (!is_zero (xmm_src_hi))
        {
            xmm_dst_hi = load_128_aligned ((__m128i*) pd);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

            expand_alpha_2x128 (
                xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned ((__m128i*)pd,
                              pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        w -= 4;
        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static force_inline void
core_combine_over_reverse_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (dst),
                               expand_alpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}

static force_inline void
core_combine_in_u_sse2 (uint32_t*       pd,
                        const uint32_t* ps,
                        const uint32_t* pm,
                        int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_reverse_in_u_sse2 (uint32_t*       pd,
                                const uint32_t* ps,
                                const uint32_t* pm,
                                int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_reverse_out_u_sse2 (uint32_t*       pd,
                                 const uint32_t* ps,
                                 const uint32_t* pm,
                                 int             w)
{
    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
        if (pm)
            pm++;
        ps++;
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static force_inline void
core_combine_out_u_sse2 (uint32_t*       pd,
                         const uint32_t* ps,
                         const uint32_t* pm,
                         int             w)
{
    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (s), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (s), negate_1x64 (
                    expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
    __m64 da = expand_alpha_1x64 (d);

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}

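/* For reference (added): ATOP is
 * dest = src * alpha(dest) + dest * (1 - alpha(src));
 * pix_add_multiply_1x64 evaluates both products and adds them with
 * unsigned saturation.
 */
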
static force_inline void
core_combine_atop_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}

static force_inline void
core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
}

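/* For reference (added): XOR is
 * dest = src * (1 - alpha(dest)) + dest * (1 - alpha(src)),
 * hence both alphas are negated before the add-multiply.
 */
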
static force_inline void
core_combine_xor_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t *mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_add_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t* mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        __m128i s;

        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x64 (
            ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}

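/* For reference (added): SATURATE adds as much of the source as fits
 * in the remaining destination coverage.  When alpha(src) exceeds
 * 1 - alpha(dest), the source is first scaled by
 * (1 - alpha(dest)) / alpha(src), i.e. DIV_UN8 (da, sa), so the final
 * saturating add cannot overflow the alpha channel.
 */
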
static force_inline void
core_combine_saturate_u_sse2 (uint32_t *      pd,
                              const uint32_t *ps,
                              const uint32_t *pm,
                              int             w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    maybe_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        maybe_prefetch_next ((__m128i*)pm);

        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some alpha src is greater than respective ~alpha dst */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
core_combine_src_ca_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }
}

static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expand_alpha_1x64 (s);
    __m64 unpk_mask = unpack_32_1x64 (mask);
    __m64 unpk_dst  = unpack_32_1x64 (dst);

    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static force_inline void
core_combine_over_ca_sse2 (uint32_t*       pd,
                           const uint32_t* ps,
                           const uint32_t* pm,
                           int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (
        over_1x64 (d, expand_alpha_1x64 (d),
                   pix_multiply_1x64 (unpack_32_1x64 (src),
                                      unpack_32_1x64 (mask))));
}

static force_inline void
core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
                                   const uint32_t* ps,
                                   const uint32_t* pm,
                                   int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline void
core_combine_in_ca_sse2 (uint32_t *      pd,
                         const uint32_t *ps,
                         const uint32_t *pm,
                         int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                expand_alpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                expand_alpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }
}

static force_inline void
core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
                                 const uint32_t *ps,
                                 const uint32_t *pm,
                                 int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                pix_multiply_1x64 (unpack_32_1x64 (m),
                                   expand_alpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                pix_multiply_1x64 (unpack_32_1x64 (m),
                                   expand_alpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}

static force_inline void
core_combine_out_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (m)),
                negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }
}

static force_inline void
core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
                                  const uint32_t *ps,
                                  const uint32_t *pm,
                                  int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                negate_1x64 (pix_multiply_1x64 (
                                 unpack_32_1x64 (m),
                                 expand_alpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            pix_multiply_1x64 (
                unpack_32_1x64 (d),
                negate_1x64 (pix_multiply_1x64 (
                                 unpack_32_1x64 (m),
                                 expand_alpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }
}

static force_inline uint32_t
core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);
    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = expand_alpha_1x64 (d);

    s = pix_multiply_1x64 (s, m);
    m = negate_1x64 (pix_multiply_1x64 (m, sa));

    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
}

static force_inline void
core_combine_atop_ca_sse2 (uint32_t *      pd,
                           const uint32_t *ps,
                           const uint32_t *pm,
                           int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
    __m64 sa = expand_alpha_1x64 (s);

    s = pix_multiply_1x64 (s, m);
    m = pix_multiply_1x64 (m, sa);

    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
}

static force_inline void
core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
                                   const uint32_t *ps,
                                   const uint32_t *pm,
                                   int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                uint32_t mask,
                                uint32_t dst)
{
    __m64 a = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
                                       a, expand_alpha_1x64 (s)));
    __m64 dest      = pix_multiply_1x64 (s, a);
    __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&d,
                                                &alpha_dst,
                                                &dest,
                                                &alpha_src));
}

static force_inline void
core_combine_xor_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline void
core_combine_add_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
                                             unpack_32_1x64 (m)),
                          unpack_32_1x64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cache_prefetch_next ((__m128i*)ps);
        cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);

        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (
                _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
                _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (
            _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
                                             unpack_32_1x64 (m)),
                          unpack_32_1x64 (d)));
        w--;
    }
}

/* ---------------------------------------------------
 * fb_compose_setup_sSE2
 */
static force_inline __m64
create_mask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static force_inline __m128i
create_mask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static force_inline __m64
create_mask_2x32_64 (uint32_t mask0,
                     uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

/* Work around a code generation bug in Sun Studio 12. */
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
# define create_mask_2x32_128(mask0, mask1)                             \
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
#endif

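/* Usage sketch (added, assuming the implementation-creation code that
 * lives elsewhere in this file): the file-scope masks declared at the
 * top are filled in with these helpers at setup time, e.g.
 *
 *     mask_0080 = create_mask_16_128 (0x0080);
 *     mask_00ff = create_mask_16_128 (0x00ff);
 *     mask_0101 = create_mask_16_128 (0x0101);
 *     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
 */
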
/* SSE2 code patch for fbcompose.c */

static void
sse2_combine_over_u (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_over_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                             uint32_t *dst, const uint32_t *src,
                             const uint32_t *mask, int width)
{
    core_combine_over_reverse_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_u (pixman_implementation_t *imp, pixman_op_t op,
                   uint32_t *dst, const uint32_t *src,
                   const uint32_t *mask, int width)
{
    core_combine_in_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                           uint32_t *dst, const uint32_t *src,
                           const uint32_t *mask, int width)
{
    core_combine_reverse_in_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_u (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dst, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    core_combine_out_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                            uint32_t *dst, const uint32_t *src,
                            const uint32_t *mask, int width)
{
    core_combine_reverse_out_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_atop_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                             uint32_t *dst, const uint32_t *src,
                             const uint32_t *mask, int width)
{
    core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dst, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    core_combine_xor_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_add_u (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dst, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    core_combine_add_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src,
                         const uint32_t *mask, int width)
{
    core_combine_saturate_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_src_ca (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_src_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp, pixman_op_t op,
                      uint32_t *dst, const uint32_t *src,
                      const uint32_t *mask, int width)
{
    core_combine_over_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                              uint32_t *dst, const uint32_t *src,
                              const uint32_t *mask, int width)
{
    core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dst, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    core_combine_in_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                            uint32_t *dst, const uint32_t *src,
                            const uint32_t *mask, int width)
{
    core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_ca (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_out_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                             uint32_t *dst, const uint32_t *src,
                             const uint32_t *mask, int width)
{
    core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_ca (pixman_implementation_t *imp, pixman_op_t op,
                      uint32_t *dst, const uint32_t *src,
                      const uint32_t *mask, int width)
{
    core_combine_atop_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                              uint32_t *dst, const uint32_t *src,
                              const uint32_t *mask, int width)
{
    core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_xor_ca (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_xor_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_add_ca (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_add_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

2926 /* -------------------------------------------------------------------
2927 * composite_over_n_8888
2931 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2933 pixman_image_t * src_image,
2934 pixman_image_t * mask_image,
2935 pixman_image_t * dst_image,
2946 uint32_t *dst_line, *dst, d;
2949 __m128i xmm_src, xmm_alpha;
2950 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2952 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2957 PIXMAN_IMAGE_GET_LINE (
2958 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2960 xmm_src = expand_pixel_32_1x128 (src);
2961 xmm_alpha = expand_alpha_1x128 (xmm_src);
/* call prefetch hint to optimize cache load */
2968 cache_prefetch ((__m128i*)dst);
2970 dst_line += dst_stride;
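/* The usual loop structure: composite single pixels until dst is
 * 16-byte aligned, process 4 pixels per SIMD iteration, then
 * finish the remaining tail pixels one at a time. */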
2973 while (w && (unsigned long)dst & 15)
2976 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2977 _mm_movepi64_pi64 (xmm_alpha),
2978 unpack_32_1x64 (d)));
2982 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
2987 cache_prefetch_next ((__m128i*)dst);
2989 xmm_dst = load_128_aligned ((__m128i*)dst);
2991 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2993 over_2x128 (&xmm_src, &xmm_src,
2994 &xmm_alpha, &xmm_alpha,
2995 &xmm_dst_lo, &xmm_dst_hi);
/* rebuild the 4 pixels and save */
2999 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3008 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3009 _mm_movepi64_pi64 (xmm_alpha),
3010 unpack_32_1x64 (d)));
3018 /* ---------------------------------------------------------------------
3019 * composite_over_n_0565
3022 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3024 pixman_image_t * src_image,
3025 pixman_image_t * mask_image,
3026 pixman_image_t * dst_image,
3037 uint16_t *dst_line, *dst, d;
3040 __m128i xmm_src, xmm_alpha;
3041 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3043 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3048 PIXMAN_IMAGE_GET_LINE (
3049 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3051 xmm_src = expand_pixel_32_1x128 (src);
3052 xmm_alpha = expand_alpha_1x128 (xmm_src);
/* call prefetch hint to optimize cache load */
3059 cache_prefetch ((__m128i*)dst);
3061 dst_line += dst_stride;
3064 while (w && (unsigned long)dst & 15)
3068 *dst++ = pack_565_32_16 (
3069 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3070 _mm_movepi64_pi64 (xmm_alpha),
3071 expand565_16_1x64 (d))));
/* call prefetch hint to optimize cache load */
3076 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
3081 cache_prefetch_next ((__m128i*)dst);
3083 xmm_dst = load_128_aligned ((__m128i*)dst);
3085 unpack_565_128_4x128 (xmm_dst,
3086 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
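/* xmm_dst0..3 now hold the 8 r5g6b5 destination pixels widened
 * to 16 bits per channel, two pixels per register */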
3088 over_2x128 (&xmm_src, &xmm_src,
3089 &xmm_alpha, &xmm_alpha,
3090 &xmm_dst0, &xmm_dst1);
3091 over_2x128 (&xmm_src, &xmm_src,
3092 &xmm_alpha, &xmm_alpha,
3093 &xmm_dst2, &xmm_dst3);
3095 xmm_dst = pack_565_4x128_128 (
3096 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3098 save_128_aligned ((__m128i*)dst, xmm_dst);
3107 *dst++ = pack_565_32_16 (
3108 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3109 _mm_movepi64_pi64 (xmm_alpha),
3110 expand565_16_1x64 (d))));
3117 /* ------------------------------
3118 * composite_add_n_8888_8888_ca
3121 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3123 pixman_image_t * src_image,
3124 pixman_image_t * mask_image,
3125 pixman_image_t * dst_image,
3136 uint32_t *dst_line, d;
3137 uint32_t *mask_line, m;
3139 int dst_stride, mask_stride;
3141 __m128i xmm_src, xmm_alpha;
3143 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3145 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3147 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3153 PIXMAN_IMAGE_GET_LINE (
3154 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3155 PIXMAN_IMAGE_GET_LINE (
3156 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3158 xmm_src = _mm_unpacklo_epi8 (
3159 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3160 xmm_alpha = expand_alpha_1x128 (xmm_src);
3161 mmx_src = _mm_movepi64_pi64 (xmm_src);
3162 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3167 const uint32_t *pm = (uint32_t *)mask_line;
3168 uint32_t *pd = (uint32_t *)dst_line;
3170 dst_line += dst_stride;
3171 mask_line += mask_stride;
/* call prefetch hint to optimize cache load */
3174 cache_prefetch ((__m128i*)pd);
3175 cache_prefetch ((__m128i*)pm);
3177 while (w && (unsigned long)pd & 15)
3185 mmx_mask = unpack_32_1x64 (m);
3186 mmx_dest = unpack_32_1x64 (d);
3188 *pd = pack_1x64_32 (
3189 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
/* call prefetch hint to optimize cache load */
3197 cache_prefetch ((__m128i*)pd);
3198 cache_prefetch ((__m128i*)pm);
/* prefetch the next cache line */
3203 cache_prefetch_next ((__m128i*)pd);
3204 cache_prefetch_next ((__m128i*)pm);
3206 xmm_mask = load_128_unaligned ((__m128i*)pm);
3210 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
/* if all bits in mask are zero, pack_cmp is equal to 0xffff */
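/* (_mm_cmpeq_epi32 turns every all-zero pixel into all ones and
 * _mm_movemask_epi8 gathers the 16 byte sign bits, so 0xffff
 * means the whole mask vector is zero and the destination
 * pixels can be left untouched) */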
3213 if (pack_cmp != 0xffff)
3215 xmm_dst = load_128_aligned ((__m128i*)pd);
3217 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3219 pix_multiply_2x128 (&xmm_src, &xmm_src,
3220 &xmm_mask_lo, &xmm_mask_hi,
3221 &xmm_mask_lo, &xmm_mask_hi);
3222 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3225 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3241 mmx_mask = unpack_32_1x64 (m);
3242 mmx_dest = unpack_32_1x64 (d);
3244 *pd = pack_1x64_32 (
3245 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3256 /* ---------------------------------------------------------------------------
3257 * composite_over_n_8888_8888_ca
3261 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3263 pixman_image_t * src_image,
3264 pixman_image_t * mask_image,
3265 pixman_image_t * dst_image,
3276 uint32_t *dst_line, d;
3277 uint32_t *mask_line, m;
3279 int dst_stride, mask_stride;
3281 __m128i xmm_src, xmm_alpha;
3282 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3283 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3285 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3287 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3292 PIXMAN_IMAGE_GET_LINE (
3293 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3294 PIXMAN_IMAGE_GET_LINE (
3295 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3297 xmm_src = _mm_unpacklo_epi8 (
3298 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3299 xmm_alpha = expand_alpha_1x128 (xmm_src);
3300 mmx_src = _mm_movepi64_pi64 (xmm_src);
3301 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3306 const uint32_t *pm = (uint32_t *)mask_line;
3307 uint32_t *pd = (uint32_t *)dst_line;
3309 dst_line += dst_stride;
3310 mask_line += mask_stride;
/* call prefetch hint to optimize cache load */
3313 cache_prefetch ((__m128i*)pd);
3314 cache_prefetch ((__m128i*)pm);
3316 while (w && (unsigned long)pd & 15)
3323 mmx_mask = unpack_32_1x64 (m);
3324 mmx_dest = unpack_32_1x64 (d);
3326 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
/* call prefetch hint to optimize cache load */
3337 cache_prefetch ((__m128i*)pd);
3338 cache_prefetch ((__m128i*)pm);
/* prefetch the next cache line */
3343 cache_prefetch_next ((__m128i*)pd);
3344 cache_prefetch_next ((__m128i*)pm);
3346 xmm_mask = load_128_unaligned ((__m128i*)pm);
3350 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
/* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3353 if (pack_cmp != 0xffff)
3355 xmm_dst = load_128_aligned ((__m128i*)pd);
3357 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3358 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3360 in_over_2x128 (&xmm_src, &xmm_src,
3361 &xmm_alpha, &xmm_alpha,
3362 &xmm_mask_lo, &xmm_mask_hi,
3363 &xmm_dst_lo, &xmm_dst_hi);
3366 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3381 mmx_mask = unpack_32_1x64 (m);
3382 mmx_dest = unpack_32_1x64 (d);
3384 *pd = pack_1x64_32 (
3385 in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3396 /*---------------------------------------------------------------------
3397 * composite_over_8888_n_8888
3401 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3403 pixman_image_t * src_image,
3404 pixman_image_t * mask_image,
3405 pixman_image_t * dst_image,
3415 uint32_t *dst_line, *dst;
3416 uint32_t *src_line, *src;
3419 int dst_stride, src_stride;
3422 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3423 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3424 __m128i xmm_alpha_lo, xmm_alpha_hi;
3426 PIXMAN_IMAGE_GET_LINE (
3427 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3428 PIXMAN_IMAGE_GET_LINE (
3429 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3431 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3433 xmm_mask = create_mask_16_128 (mask >> 24);
3438 dst_line += dst_stride;
3440 src_line += src_stride;
/* call prefetch hint to optimize cache load */
3444 cache_prefetch ((__m128i*)dst);
3445 cache_prefetch ((__m128i*)src);
3447 while (w && (unsigned long)dst & 15)
3449 uint32_t s = *src++;
3452 __m64 ms = unpack_32_1x64 (s);
3453 __m64 alpha = expand_alpha_1x64 (ms);
3454 __m64 dest = _mm_movepi64_pi64 (xmm_mask);
3455 __m64 alpha_dst = unpack_32_1x64 (d);
3457 *dst++ = pack_1x64_32 (
3458 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
/* call prefetch hint to optimize cache load */
3464 cache_prefetch ((__m128i*)dst);
3465 cache_prefetch ((__m128i*)src);
/* prefetch the next cache line */
3470 cache_prefetch_next ((__m128i*)dst);
3471 cache_prefetch_next ((__m128i*)src);
3473 xmm_src = load_128_unaligned ((__m128i*)src);
3474 xmm_dst = load_128_aligned ((__m128i*)dst);
3476 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3477 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3478 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3479 &xmm_alpha_lo, &xmm_alpha_hi);
3481 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3482 &xmm_alpha_lo, &xmm_alpha_hi,
3483 &xmm_mask, &xmm_mask,
3484 &xmm_dst_lo, &xmm_dst_hi);
3487 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3496 uint32_t s = *src++;
3499 __m64 ms = unpack_32_1x64 (s);
3500 __m64 alpha = expand_alpha_1x64 (ms);
3501 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3502 __m64 dest = unpack_32_1x64 (d);
3504 *dst++ = pack_1x64_32 (
3505 in_over_1x64 (&ms, &alpha, &mask, &dest));
3514 /* ---------------------------------------------------------------------
3515 * composite_over_x888_n_8888
3518 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3520 pixman_image_t * src_image,
3521 pixman_image_t * mask_image,
3522 pixman_image_t * dst_image,
3532 uint32_t *dst_line, *dst;
3533 uint32_t *src_line, *src;
3535 int dst_stride, src_stride;
3538 __m128i xmm_mask, xmm_alpha;
3539 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3540 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3542 PIXMAN_IMAGE_GET_LINE (
3543 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3544 PIXMAN_IMAGE_GET_LINE (
3545 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3547 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3549 xmm_mask = create_mask_16_128 (mask >> 24);
3550 xmm_alpha = mask_00ff;
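/* an x888 source carries no alpha, so treat it as fully opaque:
 * 0x00ff in every 16-bit alpha slot */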
3555 dst_line += dst_stride;
3557 src_line += src_stride;
/* call prefetch hint to optimize cache load */
3561 cache_prefetch ((__m128i*)dst);
3562 cache_prefetch ((__m128i*)src);
3564 while (w && (unsigned long)dst & 15)
3566 uint32_t s = (*src++) | 0xff000000;
3569 __m64 src = unpack_32_1x64 (s);
3570 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3571 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3572 __m64 dest = unpack_32_1x64 (d);
3574 *dst++ = pack_1x64_32 (
3575 in_over_1x64 (&src, &alpha, &mask, &dest));
/* call prefetch hint to optimize cache load */
3581 cache_prefetch ((__m128i*)dst);
3582 cache_prefetch ((__m128i*)src);
/* prefetch the next cache line */
3587 cache_prefetch_next ((__m128i*)dst);
3588 cache_prefetch_next ((__m128i*)src);
3590 xmm_src = _mm_or_si128 (
3591 load_128_unaligned ((__m128i*)src), mask_ff000000);
3592 xmm_dst = load_128_aligned ((__m128i*)dst);
3594 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3595 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3597 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3598 &xmm_alpha, &xmm_alpha,
3599 &xmm_mask, &xmm_mask,
3600 &xmm_dst_lo, &xmm_dst_hi);
3603 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3613 uint32_t s = (*src++) | 0xff000000;
3616 __m64 src = unpack_32_1x64 (s);
3617 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3618 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3619 __m64 dest = unpack_32_1x64 (d);
3621 *dst++ = pack_1x64_32 (
3622 in_over_1x64 (&src, &alpha, &mask, &dest));
3631 /* --------------------------------------------------------------------
3632 * composite_over_8888_8888
3635 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3637 pixman_image_t * src_image,
3638 pixman_image_t * mask_image,
3639 pixman_image_t * dst_image,
3649 int dst_stride, src_stride;
3650 uint32_t *dst_line, *dst;
3651 uint32_t *src_line, *src;
3653 PIXMAN_IMAGE_GET_LINE (
3654 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3655 PIXMAN_IMAGE_GET_LINE (
3656 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3663 core_combine_over_u_sse2 (dst, src, NULL, width);
3671 /* ------------------------------------------------------------------
3672 * composite_over_8888_0565
3674 static force_inline uint16_t
3675 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3679 ms = unpack_32_1x64 (src);
3680 return pack_565_32_16 (
3683 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3687 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3689 pixman_image_t * src_image,
3690 pixman_image_t * mask_image,
3691 pixman_image_t * dst_image,
3701 uint16_t *dst_line, *dst, d;
3702 uint32_t *src_line, *src, s;
3703 int dst_stride, src_stride;
3706 __m128i xmm_alpha_lo, xmm_alpha_hi;
3707 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3708 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3710 PIXMAN_IMAGE_GET_LINE (
3711 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3712 PIXMAN_IMAGE_GET_LINE (
3713 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
* This code was copied from the MMX version, FIXME included.
* If it's a problem there, it's probably a problem here too.
3721 assert (src_image->drawable == mask_image->drawable);
/* call prefetch hint to optimize cache load */
3730 cache_prefetch ((__m128i*)src);
3731 cache_prefetch ((__m128i*)dst);
3733 dst_line += dst_stride;
3734 src_line += src_stride;
3737 /* Align dst on a 16-byte boundary */
3739 ((unsigned long)dst & 15))
3744 *dst++ = composite_over_8888_0565pixel (s, d);
/* call prefetch hint to optimize cache load */
3749 cache_prefetch ((__m128i*)src);
3750 cache_prefetch ((__m128i*)dst);
/* This is an 8-pixel loop */
/* prefetch the next cache line */
3756 cache_prefetch_next ((__m128i*)src);
3757 cache_prefetch_next ((__m128i*)dst);
/* Load unaligned because the source address is not
 * guaranteed to be 16-byte aligned.
3762 xmm_src = load_128_unaligned ((__m128i*) src);
3763 xmm_dst = load_128_aligned ((__m128i*) dst);
3766 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3767 unpack_565_128_4x128 (xmm_dst,
3768 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3769 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3770 &xmm_alpha_lo, &xmm_alpha_hi);
/* Load the next 4 pixels from memory early
 * to overlap the read with the computation.
3775 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3777 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3778 &xmm_alpha_lo, &xmm_alpha_hi,
3779 &xmm_dst0, &xmm_dst1);
3782 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3783 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3784 &xmm_alpha_lo, &xmm_alpha_hi);
3786 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3787 &xmm_alpha_lo, &xmm_alpha_hi,
3788 &xmm_dst2, &xmm_dst3);
3791 (__m128i*)dst, pack_565_4x128_128 (
3792 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3804 *dst++ = composite_over_8888_0565pixel (s, d);
3811 /* -----------------------------------------------------------------
3812 * composite_over_n_8_8888
3816 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3818 pixman_image_t * src_image,
3819 pixman_image_t * mask_image,
3820 pixman_image_t * dst_image,
3831 uint32_t *dst_line, *dst;
3832 uint8_t *mask_line, *mask;
3833 int dst_stride, mask_stride;
3837 __m128i xmm_src, xmm_alpha, xmm_def;
3838 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3839 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3841 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3843 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3849 PIXMAN_IMAGE_GET_LINE (
3850 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3851 PIXMAN_IMAGE_GET_LINE (
3852 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3854 xmm_def = create_mask_2x32_128 (src, src);
3855 xmm_src = expand_pixel_32_1x128 (src);
3856 xmm_alpha = expand_alpha_1x128 (xmm_src);
3857 mmx_src = _mm_movepi64_pi64 (xmm_src);
3858 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3863 dst_line += dst_stride;
3865 mask_line += mask_stride;
/* call prefetch hint to optimize cache load */
3869 cache_prefetch ((__m128i*)mask);
3870 cache_prefetch ((__m128i*)dst);
3872 while (w && (unsigned long)dst & 15)
3874 uint8_t m = *mask++;
3879 mmx_mask = expand_pixel_8_1x64 (m);
3880 mmx_dest = unpack_32_1x64 (d);
3882 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
/* call prefetch hint to optimize cache load */
3893 cache_prefetch ((__m128i*)mask);
3894 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
3899 cache_prefetch_next ((__m128i*)mask);
3900 cache_prefetch_next ((__m128i*)dst);
3902 m = *((uint32_t*)mask);
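/* four a8 mask values are read at once; an opaque source under
 * a fully-set mask reduces to a plain aligned store */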
3904 if (srca == 0xff && m == 0xffffffff)
3906 save_128_aligned ((__m128i*)dst, xmm_def);
3910 xmm_dst = load_128_aligned ((__m128i*) dst);
3911 xmm_mask = unpack_32_1x128 (m);
3912 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3915 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3916 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3918 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3919 &xmm_mask_lo, &xmm_mask_hi);
3921 in_over_2x128 (&xmm_src, &xmm_src,
3922 &xmm_alpha, &xmm_alpha,
3923 &xmm_mask_lo, &xmm_mask_hi,
3924 &xmm_dst_lo, &xmm_dst_hi);
3927 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3937 uint8_t m = *mask++;
3942 mmx_mask = expand_pixel_8_1x64 (m);
3943 mmx_dest = unpack_32_1x64 (d);
3945 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3959 /* ----------------------------------------------------------------
* pixman_fill_sse2 / composite_src_n_8_8888
3964 pixman_fill_sse2 (uint32_t *bits,
3973 uint32_t byte_width;
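/* a 16 bpp fill is only possible when the 32-bit fill value holds
 * the same pixel in both halves, so the value can be stored four
 * pixels at a time */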
3978 if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3981 if (bpp != 16 && bpp != 32)
3986 stride = stride * (int) sizeof (uint32_t) / 2;
3987 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3988 byte_width = 2 * width;
3993 stride = stride * (int) sizeof (uint32_t) / 4;
3994 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3995 byte_width = 4 * width;
3999 cache_prefetch ((__m128i*)byte_line);
4000 xmm_def = create_mask_2x32_128 (data, data);
4005 uint8_t *d = byte_line;
4006 byte_line += stride;
4010 cache_prefetch_next ((__m128i*)d);
4012 while (w >= 2 && ((unsigned long)d & 3))
4014 *(uint16_t *)d = data;
4019 while (w >= 4 && ((unsigned long)d & 15))
4021 *(uint32_t *)d = data;
4027 cache_prefetch_next ((__m128i*)d);
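/* bulk loop: eight aligned 16-byte stores write 128 bytes per
 * iteration, prefetching well ahead of the store pointer */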
4031 cache_prefetch (((__m128i*)d) + 12);
4033 save_128_aligned ((__m128i*)(d), xmm_def);
4034 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4035 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4036 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4037 save_128_aligned ((__m128i*)(d + 64), xmm_def);
4038 save_128_aligned ((__m128i*)(d + 80), xmm_def);
4039 save_128_aligned ((__m128i*)(d + 96), xmm_def);
4040 save_128_aligned ((__m128i*)(d + 112), xmm_def);
4048 cache_prefetch (((__m128i*)d) + 8);
4050 save_128_aligned ((__m128i*)(d), xmm_def);
4051 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4052 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4053 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4059 cache_prefetch_next ((__m128i*)d);
4063 save_128_aligned ((__m128i*)(d), xmm_def);
4064 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4072 save_128_aligned ((__m128i*)(d), xmm_def);
4078 cache_prefetch_next ((__m128i*)d);
4082 *(uint32_t *)d = data;
4090 *(uint16_t *)d = data;
4101 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4103 pixman_image_t * src_image,
4104 pixman_image_t * mask_image,
4105 pixman_image_t * dst_image,
4116 uint32_t *dst_line, *dst;
4117 uint8_t *mask_line, *mask;
4118 int dst_stride, mask_stride;
4122 __m128i xmm_src, xmm_def;
4123 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4125 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4130 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4131 PIXMAN_FORMAT_BPP (dst_image->bits.format),
4132 dest_x, dest_y, width, height, 0);
4136 PIXMAN_IMAGE_GET_LINE (
4137 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4138 PIXMAN_IMAGE_GET_LINE (
4139 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4141 xmm_def = create_mask_2x32_128 (src, src);
4142 xmm_src = expand_pixel_32_1x128 (src);
4147 dst_line += dst_stride;
4149 mask_line += mask_stride;
/* call prefetch hint to optimize cache load */
4153 cache_prefetch ((__m128i*)mask);
4154 cache_prefetch ((__m128i*)dst);
4156 while (w && (unsigned long)dst & 15)
4158 uint8_t m = *mask++;
4162 *dst = pack_1x64_32 (
4164 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
/* call prefetch hint to optimize cache load */
4176 cache_prefetch ((__m128i*)mask);
4177 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
4182 cache_prefetch_next ((__m128i*)mask);
4183 cache_prefetch_next ((__m128i*)dst);
4185 m = *((uint32_t*)mask);
4187 if (srca == 0xff && m == 0xffffffff)
4189 save_128_aligned ((__m128i*)dst, xmm_def);
4193 xmm_mask = unpack_32_1x128 (m);
4194 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4197 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4199 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4200 &xmm_mask_lo, &xmm_mask_hi);
4202 pix_multiply_2x128 (&xmm_src, &xmm_src,
4203 &xmm_mask_lo, &xmm_mask_hi,
4204 &xmm_mask_lo, &xmm_mask_hi);
4207 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4211 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4221 uint8_t m = *mask++;
4225 *dst = pack_1x64_32 (
4227 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4242 /*-----------------------------------------------------------------------
4243 * composite_over_n_8_0565
4247 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4249 pixman_image_t * src_image,
4250 pixman_image_t * mask_image,
4251 pixman_image_t * dst_image,
4262 uint16_t *dst_line, *dst, d;
4263 uint8_t *mask_line, *mask;
4264 int dst_stride, mask_stride;
4267 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4269 __m128i xmm_src, xmm_alpha;
4270 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4271 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4273 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4279 PIXMAN_IMAGE_GET_LINE (
4280 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4281 PIXMAN_IMAGE_GET_LINE (
4282 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4284 xmm_src = expand_pixel_32_1x128 (src);
4285 xmm_alpha = expand_alpha_1x128 (xmm_src);
4286 mmx_src = _mm_movepi64_pi64 (xmm_src);
4287 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4292 dst_line += dst_stride;
4294 mask_line += mask_stride;
/* call prefetch hint to optimize cache load */
4298 cache_prefetch ((__m128i*)mask);
4299 cache_prefetch ((__m128i*)dst);
4301 while (w && (unsigned long)dst & 15)
4308 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4309 mmx_dest = expand565_16_1x64 (d);
4311 *dst = pack_565_32_16 (
4314 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
/* call prefetch hint to optimize cache load */
4322 cache_prefetch ((__m128i*)mask);
4323 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
4328 cache_prefetch_next ((__m128i*)mask);
4329 cache_prefetch_next ((__m128i*)dst);
4331 xmm_dst = load_128_aligned ((__m128i*) dst);
4332 unpack_565_128_4x128 (xmm_dst,
4333 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4335 m = *((uint32_t*)mask);
4340 xmm_mask = unpack_32_1x128 (m);
4341 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4344 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4346 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4347 &xmm_mask_lo, &xmm_mask_hi);
4349 in_over_2x128 (&xmm_src, &xmm_src,
4350 &xmm_alpha, &xmm_alpha,
4351 &xmm_mask_lo, &xmm_mask_hi,
4352 &xmm_dst0, &xmm_dst1);
4355 m = *((uint32_t*)mask);
4360 xmm_mask = unpack_32_1x128 (m);
4361 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4364 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4366 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4367 &xmm_mask_lo, &xmm_mask_hi);
4368 in_over_2x128 (&xmm_src, &xmm_src,
4369 &xmm_alpha, &xmm_alpha,
4370 &xmm_mask_lo, &xmm_mask_hi,
4371 &xmm_dst2, &xmm_dst3);
4375 (__m128i*)dst, pack_565_4x128_128 (
4376 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4389 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4390 mmx_dest = expand565_16_1x64 (d);
4392 *dst = pack_565_32_16 (
4395 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4406 /* -----------------------------------------------------------------------
4407 * composite_over_pixbuf_0565
4411 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4413 pixman_image_t * src_image,
4414 pixman_image_t * mask_image,
4415 pixman_image_t * dst_image,
4425 uint16_t *dst_line, *dst, d;
4426 uint32_t *src_line, *src, s;
4427 int dst_stride, src_stride;
4429 uint32_t opaque, zero;
4432 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4433 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4435 PIXMAN_IMAGE_GET_LINE (
4436 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4437 PIXMAN_IMAGE_GET_LINE (
4438 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
* This code was copied from the MMX version, FIXME included.
* If it's a problem there, it's probably a problem here too.
4446 assert (src_image->drawable == mask_image->drawable);
4452 dst_line += dst_stride;
4454 src_line += src_stride;
/* call prefetch hint to optimize cache load */
4458 cache_prefetch ((__m128i*)src);
4459 cache_prefetch ((__m128i*)dst);
4461 while (w && (unsigned long)dst & 15)
4466 ms = unpack_32_1x64 (s);
4468 *dst++ = pack_565_32_16 (
4470 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
/* call prefetch hint to optimize cache load */
4475 cache_prefetch ((__m128i*)src);
4476 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
4481 cache_prefetch_next ((__m128i*)src);
4482 cache_prefetch_next ((__m128i*)dst);
4485 xmm_src = load_128_unaligned ((__m128i*)src);
4486 xmm_dst = load_128_aligned ((__m128i*)dst);
4488 opaque = is_opaque (xmm_src);
4489 zero = is_zero (xmm_src);
4491 unpack_565_128_4x128 (xmm_dst,
4492 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4493 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
/* preload next round */
4496 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4500 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4501 &xmm_dst0, &xmm_dst1);
4505 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4506 &xmm_dst0, &xmm_dst1);
4510 opaque = is_opaque (xmm_src);
4511 zero = is_zero (xmm_src);
4513 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4517 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4518 &xmm_dst2, &xmm_dst3);
4522 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4523 &xmm_dst2, &xmm_dst3);
4527 (__m128i*)dst, pack_565_4x128_128 (
4528 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4540 ms = unpack_32_1x64 (s);
4542 *dst++ = pack_565_32_16 (
4544 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4552 /* -------------------------------------------------------------------------
4553 * composite_over_pixbuf_8888
4557 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4559 pixman_image_t * src_image,
4560 pixman_image_t * mask_image,
4561 pixman_image_t * dst_image,
4571 uint32_t *dst_line, *dst, d;
4572 uint32_t *src_line, *src, s;
4573 int dst_stride, src_stride;
4575 uint32_t opaque, zero;
4577 __m128i xmm_src_lo, xmm_src_hi;
4578 __m128i xmm_dst_lo, xmm_dst_hi;
4580 PIXMAN_IMAGE_GET_LINE (
4581 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4582 PIXMAN_IMAGE_GET_LINE (
4583 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
* This code was copied from the MMX version, FIXME included.
* If it's a problem there, it's probably a problem here too.
4591 assert (src_image->drawable == mask_image->drawable);
4597 dst_line += dst_stride;
4599 src_line += src_stride;
/* call prefetch hint to optimize cache load */
4603 cache_prefetch ((__m128i*)src);
4604 cache_prefetch ((__m128i*)dst);
4606 while (w && (unsigned long)dst & 15)
4611 *dst++ = pack_1x64_32 (
4612 over_rev_non_pre_1x64 (
4613 unpack_32_1x64 (s), unpack_32_1x64 (d)));
/* call prefetch hint to optimize cache load */
4619 cache_prefetch ((__m128i*)src);
4620 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
4625 cache_prefetch_next ((__m128i*)src);
4626 cache_prefetch_next ((__m128i*)dst);
4628 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4630 opaque = is_opaque (xmm_src_hi);
4631 zero = is_zero (xmm_src_hi);
4633 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4637 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4638 &xmm_dst_lo, &xmm_dst_hi);
4641 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4645 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4647 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4649 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4650 &xmm_dst_lo, &xmm_dst_hi);
4653 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4666 *dst++ = pack_1x64_32 (
4667 over_rev_non_pre_1x64 (
4668 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4677 /* -------------------------------------------------------------------------------------------------
4678 * composite_over_n_8888_0565_ca
4682 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4684 pixman_image_t * src_image,
4685 pixman_image_t * mask_image,
4686 pixman_image_t * dst_image,
4697 uint16_t *dst_line, *dst, d;
4698 uint32_t *mask_line, *mask, m;
4699 int dst_stride, mask_stride;
4703 __m128i xmm_src, xmm_alpha;
4704 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4705 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4707 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4709 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4714 PIXMAN_IMAGE_GET_LINE (
4715 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4716 PIXMAN_IMAGE_GET_LINE (
4717 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4719 xmm_src = expand_pixel_32_1x128 (src);
4720 xmm_alpha = expand_alpha_1x128 (xmm_src);
4721 mmx_src = _mm_movepi64_pi64 (xmm_src);
4722 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4729 mask_line += mask_stride;
4730 dst_line += dst_stride;
/* call prefetch hint to optimize cache load */
4733 cache_prefetch ((__m128i*)mask);
4734 cache_prefetch ((__m128i*)dst);
4736 while (w && ((unsigned long)dst & 15))
4738 m = *(uint32_t *) mask;
4743 mmx_mask = unpack_32_1x64 (m);
4744 mmx_dest = expand565_16_1x64 (d);
4746 *dst = pack_565_32_16 (
4749 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
/* call prefetch hint to optimize cache load */
4758 cache_prefetch ((__m128i*)mask);
4759 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
4764 cache_prefetch_next ((__m128i*)mask);
4765 cache_prefetch_next ((__m128i*)dst);
4768 xmm_mask = load_128_unaligned ((__m128i*)mask);
4769 xmm_dst = load_128_aligned ((__m128i*)dst);
4771 pack_cmp = _mm_movemask_epi8 (
4772 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4774 unpack_565_128_4x128 (xmm_dst,
4775 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4776 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4778 /* preload next round */
4779 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4782 if (pack_cmp != 0xffff)
4784 in_over_2x128 (&xmm_src, &xmm_src,
4785 &xmm_alpha, &xmm_alpha,
4786 &xmm_mask_lo, &xmm_mask_hi,
4787 &xmm_dst0, &xmm_dst1);
4791 pack_cmp = _mm_movemask_epi8 (
4792 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4794 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4796 if (pack_cmp != 0xffff)
4798 in_over_2x128 (&xmm_src, &xmm_src,
4799 &xmm_alpha, &xmm_alpha,
4800 &xmm_mask_lo, &xmm_mask_hi,
4801 &xmm_dst2, &xmm_dst3);
4805 (__m128i*)dst, pack_565_4x128_128 (
4806 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4815 m = *(uint32_t *) mask;
4820 mmx_mask = unpack_32_1x64 (m);
4821 mmx_dest = expand565_16_1x64 (d);
4823 *dst = pack_565_32_16 (
4826 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4838 /* -----------------------------------------------------------------------
4839 * composite_in_n_8_8
4843 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4845 pixman_image_t * src_image,
4846 pixman_image_t * mask_image,
4847 pixman_image_t * dst_image,
4857 uint8_t *dst_line, *dst;
4858 uint8_t *mask_line, *mask;
4859 int dst_stride, mask_stride;
4865 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4866 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4868 PIXMAN_IMAGE_GET_LINE (
4869 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4870 PIXMAN_IMAGE_GET_LINE (
4871 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4873 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4877 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4882 dst_line += dst_stride;
4884 mask_line += mask_stride;
/* call prefetch hint to optimize cache load */
4888 cache_prefetch ((__m128i*)mask);
4889 cache_prefetch ((__m128i*)dst);
4891 while (w && ((unsigned long)dst & 15))
4893 m = (uint32_t) *mask++;
4894 d = (uint32_t) *dst;
4896 *dst++ = (uint8_t) pack_1x64_32 (
4898 pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4899 unpack_32_1x64 (m)),
4900 unpack_32_1x64 (d)));
/* call prefetch hint to optimize cache load */
4905 cache_prefetch ((__m128i*)mask);
4906 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
4911 cache_prefetch_next ((__m128i*)mask);
4912 cache_prefetch_next ((__m128i*)dst);
4914 xmm_mask = load_128_unaligned ((__m128i*)mask);
4915 xmm_dst = load_128_aligned ((__m128i*)dst);
4917 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4918 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4920 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4921 &xmm_mask_lo, &xmm_mask_hi,
4922 &xmm_mask_lo, &xmm_mask_hi);
4924 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4925 &xmm_dst_lo, &xmm_dst_hi,
4926 &xmm_dst_lo, &xmm_dst_hi);
4929 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4938 m = (uint32_t) *mask++;
4939 d = (uint32_t) *dst;
4941 *dst++ = (uint8_t) pack_1x64_32 (
4944 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4945 unpack_32_1x64 (d)));
/* ---------------------------------------------------------------------------
 * composite_in_8_8
4958 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4960 pixman_image_t * src_image,
4961 pixman_image_t * mask_image,
4962 pixman_image_t * dst_image,
4972 uint8_t *dst_line, *dst;
4973 uint8_t *src_line, *src;
4974 int src_stride, dst_stride;
4978 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4979 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4981 PIXMAN_IMAGE_GET_LINE (
4982 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4983 PIXMAN_IMAGE_GET_LINE (
4984 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4989 dst_line += dst_stride;
4991 src_line += src_stride;
/* call prefetch hint to optimize cache load */
4995 cache_prefetch ((__m128i*)src);
4996 cache_prefetch ((__m128i*)dst);
4998 while (w && ((unsigned long)dst & 15))
5000 s = (uint32_t) *src++;
5001 d = (uint32_t) *dst;
5003 *dst++ = (uint8_t) pack_1x64_32 (
5005 unpack_32_1x64 (s), unpack_32_1x64 (d)));
/* call prefetch hint to optimize cache load */
5010 cache_prefetch ((__m128i*)src);
5011 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
5016 cache_prefetch_next ((__m128i*)src);
5017 cache_prefetch_next ((__m128i*)dst);
5019 xmm_src = load_128_unaligned ((__m128i*)src);
5020 xmm_dst = load_128_aligned ((__m128i*)dst);
5022 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5023 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5025 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5026 &xmm_dst_lo, &xmm_dst_hi,
5027 &xmm_dst_lo, &xmm_dst_hi);
5030 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5039 s = (uint32_t) *src++;
5040 d = (uint32_t) *dst;
5042 *dst++ = (uint8_t) pack_1x64_32 (
5043 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5051 /* -------------------------------------------------------------------------
5052 * composite_add_n_8_8
5056 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5058 pixman_image_t * src_image,
5059 pixman_image_t * mask_image,
5060 pixman_image_t * dst_image,
5070 uint8_t *dst_line, *dst;
5071 uint8_t *mask_line, *mask;
5072 int dst_stride, mask_stride;
5079 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5080 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5082 PIXMAN_IMAGE_GET_LINE (
5083 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5084 PIXMAN_IMAGE_GET_LINE (
5085 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5087 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5091 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5096 dst_line += dst_stride;
5098 mask_line += mask_stride;
/* call prefetch hint to optimize cache load */
5102 cache_prefetch ((__m128i*)mask);
5103 cache_prefetch ((__m128i*)dst);
5105 while (w && ((unsigned long)dst & 15))
5107 m = (uint32_t) *mask++;
5108 d = (uint32_t) *dst;
5110 *dst++ = (uint8_t) pack_1x64_32 (
5113 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5114 unpack_32_1x64 (d)));
/* call prefetch hint to optimize cache load */
5119 cache_prefetch ((__m128i*)mask);
5120 cache_prefetch ((__m128i*)dst);
/* prefetch the next cache line */
5125 cache_prefetch_next ((__m128i*)mask);
5126 cache_prefetch_next ((__m128i*)dst);
5128 xmm_mask = load_128_unaligned ((__m128i*)mask);
5129 xmm_dst = load_128_aligned ((__m128i*)dst);
5131 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5132 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5134 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5135 &xmm_mask_lo, &xmm_mask_hi,
5136 &xmm_mask_lo, &xmm_mask_hi);
5138 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5139 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5142 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5151 m = (uint32_t) *mask++;
5152 d = (uint32_t) *dst;
5154 *dst++ = (uint8_t) pack_1x64_32 (
5157 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5158 unpack_32_1x64 (d)));
5167 /* ----------------------------------------------------------------------
5168 * composite_add_8000_8000
5172 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5174 pixman_image_t * src_image,
5175 pixman_image_t * mask_image,
5176 pixman_image_t * dst_image,
5186 uint8_t *dst_line, *dst;
5187 uint8_t *src_line, *src;
5188 int dst_stride, src_stride;
5192 PIXMAN_IMAGE_GET_LINE (
5193 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5194 PIXMAN_IMAGE_GET_LINE (
5195 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
/* call prefetch hint to optimize cache load */
5203 cache_prefetch ((__m128i*)src);
5204 cache_prefetch ((__m128i*)dst);
5206 dst_line += dst_stride;
5207 src_line += src_stride;
5211 while (w && (unsigned long)dst & 3)
5213 t = (*dst) + (*src++);
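/* branchless saturating add: if the sum exceeds 0xff then
 * (t >> 8) is 1, so (0 - (t >> 8)) is all ones and the OR
 * clamps the result to 0xff */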
5214 *dst++ = t | (0 - (t >> 8));
5218 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5228 t = (*dst) + (*src++);
5229 *dst++ = t | (0 - (t >> 8));
5237 /* ---------------------------------------------------------------------
5238 * composite_add_8888_8888
5241 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5243 pixman_image_t * src_image,
5244 pixman_image_t * mask_image,
5245 pixman_image_t * dst_image,
5255 uint32_t *dst_line, *dst;
5256 uint32_t *src_line, *src;
5257 int dst_stride, src_stride;
5259 PIXMAN_IMAGE_GET_LINE (
5260 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5261 PIXMAN_IMAGE_GET_LINE (
5262 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5267 dst_line += dst_stride;
5269 src_line += src_stride;
5271 core_combine_add_u_sse2 (dst, src, NULL, width);
5277 /* -------------------------------------------------------------------------------------------------
5278 * sse2_composite_copy_area
5281 static pixman_bool_t
5282 pixman_blt_sse2 (uint32_t *src_bits,
5295 uint8_t * src_bytes;
5296 uint8_t * dst_bytes;
5299 if (src_bpp != dst_bpp)
5304 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5305 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5307 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5308 byte_width = 2 * width;
5312 else if (src_bpp == 32)
5314 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5315 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5316 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5317 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5318 byte_width = 4 * width;
5327 cache_prefetch ((__m128i*)src_bytes);
5328 cache_prefetch ((__m128i*)dst_bytes);
5333 uint8_t *s = src_bytes;
5334 uint8_t *d = dst_bytes;
5335 src_bytes += src_stride;
5336 dst_bytes += dst_stride;
5339 cache_prefetch_next ((__m128i*)s);
5340 cache_prefetch_next ((__m128i*)d);
5342 while (w >= 2 && ((unsigned long)d & 3))
5344 *(uint16_t *)d = *(uint16_t *)s;
5350 while (w >= 4 && ((unsigned long)d & 15))
5352 *(uint32_t *)d = *(uint32_t *)s;
5359 cache_prefetch_next ((__m128i*)s);
5360 cache_prefetch_next ((__m128i*)d);
5364 __m128i xmm0, xmm1, xmm2, xmm3;
5366 /* 128 bytes ahead */
5367 cache_prefetch (((__m128i*)s) + 8);
5368 cache_prefetch (((__m128i*)d) + 8);
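/* copy 64 bytes per iteration: unaligned loads from the source,
 * aligned stores to the destination */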
5370 xmm0 = load_128_unaligned ((__m128i*)(s));
5371 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5372 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5373 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5375 save_128_aligned ((__m128i*)(d), xmm0);
5376 save_128_aligned ((__m128i*)(d + 16), xmm1);
5377 save_128_aligned ((__m128i*)(d + 32), xmm2);
5378 save_128_aligned ((__m128i*)(d + 48), xmm3);
5385 cache_prefetch_next ((__m128i*)s);
5386 cache_prefetch_next ((__m128i*)d);
save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
5397 cache_prefetch_next ((__m128i*)s);
5398 cache_prefetch_next ((__m128i*)d);
5402 *(uint32_t *)d = *(uint32_t *)s;
5411 *(uint16_t *)d = *(uint16_t *)s;
5424 sse2_composite_copy_area (pixman_implementation_t *imp,
5426 pixman_image_t * src_image,
5427 pixman_image_t * mask_image,
5428 pixman_image_t * dst_image,
5438 pixman_blt_sse2 (src_image->bits.bits,
5439 dst_image->bits.bits,
5440 src_image->bits.rowstride,
5441 dst_image->bits.rowstride,
5442 PIXMAN_FORMAT_BPP (src_image->bits.format),
5443 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5444 src_x, src_y, dest_x, dest_y, width, height);
5448 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5450 pixman_image_t * src_image,
5451 pixman_image_t * mask_image,
5452 pixman_image_t * dst_image,
5462 uint32_t *src, *src_line, s;
5463 uint32_t *dst, *dst_line, d;
5464 uint8_t *mask, *mask_line;
5466 int src_stride, mask_stride, dst_stride;
5470 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5471 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5472 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5474 PIXMAN_IMAGE_GET_LINE (
5475 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5476 PIXMAN_IMAGE_GET_LINE (
5477 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5478 PIXMAN_IMAGE_GET_LINE (
5479 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5484 src_line += src_stride;
5486 dst_line += dst_stride;
5488 mask_line += mask_stride;
/* call prefetch hint to optimize cache load */
5493 cache_prefetch ((__m128i*)src);
5494 cache_prefetch ((__m128i*)dst);
5495 cache_prefetch ((__m128i*)mask);
5497 while (w && (unsigned long)dst & 15)
5499 s = 0xff000000 | *src++;
5500 m = (uint32_t) *mask++;
5504 ms = unpack_32_1x64 (s);
5508 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5509 __m64 md = unpack_32_1x64 (d);
5511 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5514 *dst++ = pack_1x64_32 (ms);
/* call prefetch hint to optimize cache load */
5519 cache_prefetch ((__m128i*)src);
5520 cache_prefetch ((__m128i*)dst);
5521 cache_prefetch ((__m128i*)mask);
/* prefetch the next cache line */
5526 cache_prefetch_next ((__m128i*)src);
5527 cache_prefetch_next ((__m128i*)dst);
5528 cache_prefetch_next ((__m128i*)mask);
5530 m = *(uint32_t*) mask;
5531 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5533 if (m == 0xffffffff)
5535 save_128_aligned ((__m128i*)dst, xmm_src);
5539 xmm_dst = load_128_aligned ((__m128i*)dst);
5541 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5543 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5544 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5545 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5547 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5549 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5551 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5562 m = (uint32_t) *mask++;
5566 s = 0xff000000 | *src;
5578 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5579 md = unpack_32_1x64 (d);
5580 ms = unpack_32_1x64 (s);
5582 *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
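/* Each fast path entry lists: operator, source format, mask format,
 * destination format, the function implementing it, and flags. */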
5596 static const pixman_fast_path_t sse2_fast_paths[] =
5598 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 },
5599 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 },
5600 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 },
5601 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 },
5602 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 },
5603 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
5604 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
5605 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
5606 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
5607 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
5608 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
5609 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5610 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5611 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5612 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5613 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5614 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5615 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
5616 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
5617 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5618 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5619 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5620 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5621 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5622 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5623 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5624 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5625 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5626 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5627 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5628 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5629 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5630 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5631 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5632 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5633 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5634 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5635 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5636 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5637 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5638 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5639 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5640 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5641 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5642 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5643 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5644 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5646 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5647 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 },
5648 { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 },
5649 { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 },
5650 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_n_8_8, 0 },
5652 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5653 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5654 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5655 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5656 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 },
5657 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 },
5658 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5659 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5660 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5661 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5662 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 },
5663 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 },
5665 { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 },
5666 { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 },
* Work around a GCC bug causing crashes in Mozilla with SSE2
5674 * When using -msse, gcc generates movdqa instructions assuming that
5675 * the stack is 16 byte aligned. Unfortunately some applications, such
5676 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
5677 * causes the movdqa instructions to fail.
5679 * The __force_align_arg_pointer__ makes gcc generate a prologue that
5680 * realigns the stack pointer to 16 bytes.
5682 * On x86-64 this is not necessary because the standard ABI already
5683 * calls for a 16 byte aligned stack.
5685 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
5687 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5688 __attribute__((__force_align_arg_pointer__))
5691 sse2_composite (pixman_implementation_t *imp,
5693 pixman_image_t * src,
5694 pixman_image_t * mask,
5695 pixman_image_t * dest,
5705 if (_pixman_run_fast_path (sse2_fast_paths, imp,
5706 op, src, mask, dest,
5715 _pixman_implementation_composite (imp->delegate, op,
5723 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5724 __attribute__((__force_align_arg_pointer__))
5726 static pixman_bool_t
5727 sse2_blt (pixman_implementation_t *imp,
5728 uint32_t * src_bits,
5729 uint32_t * dst_bits,
5741 if (!pixman_blt_sse2 (
5742 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5743 src_x, src_y, dst_x, dst_y, width, height))
5746 return _pixman_implementation_blt (
5748 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5749 src_x, src_y, dst_x, dst_y, width, height);
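/* Like sse2_composite above, sse2_blt and sse2_fill try the SSE2
 * routine first and fall back to the delegate chain when it cannot
 * handle the request. */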
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *bits,
           int stride, int bpp,
           int x, int y,
           int width, int height,
           uint32_t xor)
{
    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (void)
{
    pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
    pixman_implementation_t *imp = _pixman_implementation_create (mmx);

    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
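
    /*
     * Worked example for the 565 constants: an r5g6b5 pixel holds
     * rrrrrggg gggbbbbb. Shifted left by 8, the five red bits land in
     * bits 19-23, exactly the bits kept by mask_red (0x00f80000);
     * shifts of 5 and 3 line green and blue up with mask_green and
     * mask_blue the same way. mask_565_fix_rb and mask_565_fix_g pick
     * out the top bits of each widened channel so they can be
     * replicated into the low bits the widening leaves empty, e.g.
     * maximum red 0xf8 becomes 0xf8 | (0xe0 >> 5) = 0xff.
     */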

    /* MMX constants */
    mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
    mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);

    mask_x0080 = create_mask_16_64 (0x0080);
    mask_x00ff = create_mask_16_64 (0x00ff);
    mask_x0101 = create_mask_16_64 (0x0101);
    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);

    _mm_empty ();
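
    /*
     * The __m64 masks mirror their 128-bit counterparts for the code
     * paths that drop to MMX-style operations (for instance when only
     * a partial group of pixels remains); _mm_empty () above clears
     * the MMX state so that any subsequent x87 floating point code is
     * safe.
     */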

    /* Set up function pointers */

    /* SSE code patch for fbcompose.c */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
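
    /*
     * The _u combiners handle a unified alpha mask (one alpha value
     * per mask pixel); the combine_32_ca table is used instead when a
     * component-alpha mask carries separate alpha values for the red,
     * green and blue channels, as flagged by NEED_COMPONENT_ALPHA in
     * the fast path table above.
     */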
    imp->composite = sse2_composite;
    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    return imp;
}

#endif /* USE_SSE2 */