2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41 * the pixman-x64-mmx-emulation.h file contains
42 * implementations of those MMX intrinsics that
43 * are used in the SSE2 implementation.
45 # include "pixman-x64-mmx-emulation.h"
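/* A minimal sketch of the emulation idea, not the actual contents of
 * pixman-x64-mmx-emulation.h (emu_adds_pu8 is a hypothetical name): with
 * MMX unavailable, a 64-bit intrinsic such as _mm_adds_pu8 can be routed
 * through the low half of an SSE2 register.
 */
static force_inline uint64_t
emu_adds_pu8 (uint64_t a, uint64_t b)
{
    /* widen both operands to 128 bits, saturating byte add, narrow back */
    __m128i r = _mm_adds_epu8 (_mm_cvtsi64_si128 ((int64_t)a),
                               _mm_cvtsi64_si128 ((int64_t)b));
    return (uint64_t)_mm_cvtsi128_si64 (r);
}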
50 /* --------------------------------------------------------------------
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
79 /* ----------------------------------------------------------------------
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
85 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
91 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
98 __m128i r, g, b, rb, t;
100 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
104 rb = _mm_or_si128 (r, b);
105 t = _mm_and_si128 (rb, mask_565_fix_rb);
106 t = _mm_srli_epi32 (t, 5);
107 rb = _mm_or_si128 (rb, t);
109 t = _mm_and_si128 (g, mask_565_fix_g);
110 t = _mm_srli_epi32 (t, 6);
111 g = _mm_or_si128 (g, t);
113 return _mm_or_si128 (rb, g);
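/* The two fix-up steps above replicate the high bits of each widened
 * channel into its freshly opened low bits, so a full 5-bit value 0x1f
 * expands to 0xff rather than 0xf8 (and a 6-bit 0x3f to 0xff, not 0xfc). */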
116 static force_inline void
117 unpack_565_128_4x128 (__m128i data,
125 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
128 lo = unpack_565_to_8888 (lo);
129 hi = unpack_565_to_8888 (hi);
131 unpack_128_2x128 (lo, data0, data1);
132 unpack_128_2x128 (hi, data2, data3);
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
138 return (uint16_t) (((pixel >> 8) & 0xf800) |
139 ((pixel >> 5) & 0x07e0) |
140 ((pixel >> 3) & 0x001f));
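/* Worked example: 0x00ff8040 (r = 0xff, g = 0x80, b = 0x40) packs to
 * 0xf800 | 0x0400 | 0x0008 == 0xfc08. */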
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
146 return _mm_packus_epi16 (lo, hi);
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
153 __m128i r, g1, g2, b;
155 data = pack_2x128_128 (lo, hi);
157 r = _mm_and_si128 (data, mask_565_r);
158 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
162 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
168 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169 pack_565_2x128_128 (*xmm2, *xmm3));
172 static force_inline int
173 is_opaque (__m128i x)
175 __m128i ffs = _mm_cmpeq_epi8 (x, x);
177 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
180 static force_inline int
181 is_zero (__m128i x)
183 return _mm_movemask_epi8 (
184 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
187 static force_inline int
188 is_transparent (__m128i x)
190 return (_mm_movemask_epi8 (
191 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
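/* In both predicates above, _mm_movemask_epi8 gathers one bit per byte of
 * the comparison result; bits 3, 7, 11 and 15 (mask 0x8888) correspond to
 * the alpha bytes of the four ARGB pixels, so only alpha decides the
 * outcome. */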
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
197 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
203 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204 _MM_SHUFFLE (3, 3, 3, 3)),
205 _MM_SHUFFLE (3, 3, 3, 3));
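/* A pixel unpacked to 16-bit components lies in the register as
 * [b g r a | b g r a]; the shufflelo/shufflehi pair broadcasts component 3
 * (alpha) across each half, giving [a a a a | a a a a]. */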
208 static force_inline void
209 expand_alpha_2x128 (__m128i data_lo,
216 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
219 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i data_lo,
231 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
247 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249 lo = _mm_adds_epu16 (lo, mask_0080);
250 hi = _mm_adds_epu16 (hi, mask_0080);
251 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
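/* The adds/mulhi pair above is the usual exact divide-by-255 with
 * rounding: t = a * b + 0x80, then (t + (t >> 8)) >> 8, where the second
 * step is computed as the high 16 bits of t * 0x0101. A scalar sketch of
 * the same computation (hypothetical helper, for illustration only):
 */
static force_inline uint8_t
scalar_mul_un8 (uint8_t a, uint8_t b)
{
    uint32_t t = a * b + 0x80;
    return (uint8_t) ((t + (t >> 8)) >> 8);
}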
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
258 __m128i* alpha_dst_lo,
259 __m128i* alpha_dst_hi,
262 __m128i* alpha_src_lo,
263 __m128i* alpha_src_hi,
267 __m128i t1_lo, t1_hi;
268 __m128i t2_lo, t2_hi;
270 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
273 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
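/* pix_add_multiply computes src * alpha_dst + dst * alpha_src with a
 * saturating add; the ATOP and XOR combiners below reduce to exactly this
 * form, with one or both alphas negated. */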
277 static force_inline void
278 negate_2x128 (__m128i data_lo,
283 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
287 static force_inline void
288 invert_colors_2x128 (__m128i data_lo,
295 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
311 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
313 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
315 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
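/* Premultiplied Porter-Duff OVER: dst = src + (1 - alpha(src)) * dst,
 * with all components scaled by 255 and the multiply rounding as in
 * pix_multiply above. */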
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i src_lo,
326 __m128i alpha_lo, alpha_hi;
328 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
330 lo = _mm_or_si128 (alpha_lo, mask_alpha);
331 hi = _mm_or_si128 (alpha_hi, mask_alpha);
333 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
335 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
337 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
353 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
356 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
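/* in_over is (src IN mask) OVER dst: both the source and its expanded
 * alpha are multiplied by the mask before the plain OVER is applied. */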
359 static force_inline void
360 cache_prefetch (__m128i* addr)
362 _mm_prefetch ((void const*)addr, _MM_HINT_T0);
365 static force_inline void
366 cache_prefetch_next (__m128i* addr)
368 _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
371 /* prefetching NULL is very slow on some systems, so don't do that. */
373 static force_inline void
374 maybe_prefetch (__m128i* addr)
376 if (addr)
377 cache_prefetch (addr);
380 static force_inline void
381 maybe_prefetch_next (__m128i* addr)
383 if (addr)
384 cache_prefetch_next (addr);
387 /* load 4 pixels from a 16-byte boundary aligned address */
388 static force_inline __m128i
389 load_128_aligned (__m128i* src)
391 return _mm_load_si128 (src);
394 /* load 4 pixels from an unaligned address */
395 static force_inline __m128i
396 load_128_unaligned (const __m128i* src)
398 return _mm_loadu_si128 (src);
401 /* save 4 pixels using Write Combining memory on a 16-byte
402 * boundary aligned address
404 static force_inline void
405 save_128_write_combining (__m128i* dst,
408 _mm_stream_si128 (dst, data);
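/* _mm_stream_si128 is a non-temporal (write-combining) store; it bypasses
 * the cache, which pays off when the destination is not read back soon. */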
411 /* save 4 pixels on a 16-byte boundary aligned address */
412 static force_inline void
413 save_128_aligned (__m128i* dst,
416 _mm_store_si128 (dst, data);
419 /* save 4 pixels on an unaligned address */
420 static force_inline void
421 save_128_unaligned (__m128i* dst,
424 _mm_storeu_si128 (dst, data);
427 /* ------------------------------------------------------------------
431 static force_inline __m64
432 load_32_1x64 (uint32_t data)
434 return _mm_cvtsi32_si64 (data);
437 static force_inline __m64
438 unpack_32_1x64 (uint32_t data)
440 return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
443 static force_inline __m64
444 expand_alpha_1x64 (__m64 data)
446 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
449 static force_inline __m64
450 expand_alpha_rev_1x64 (__m64 data)
452 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
455 static force_inline __m64
456 expand_pixel_8_1x64 (uint8_t data)
458 return _mm_shuffle_pi16 (
459 unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
462 static force_inline __m64
463 pix_multiply_1x64 (__m64 data,
466 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
471 static force_inline __m64
472 pix_add_multiply_1x64 (__m64* src,
477 __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
478 __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
480 return _mm_adds_pu8 (t1, t2);
483 static force_inline __m64
484 negate_1x64 (__m64 data)
486 return _mm_xor_si64 (data, mask_x00ff);
489 static force_inline __m64
490 invert_colors_1x64 (__m64 data)
492 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
495 static force_inline __m64
496 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
498 return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
501 static force_inline __m64
502 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
504 return over_1x64 (pix_multiply_1x64 (*src, *mask),
505 pix_multiply_1x64 (*alpha, *mask),
509 static force_inline __m64
510 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
512 __m64 alpha = expand_alpha_1x64 (src);
514 return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
515 _mm_or_si64 (alpha, mask_x_alpha)),
520 static force_inline uint32_t
521 pack_1x64_32 (__m64 data)
523 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
526 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into
528 * 00RR00GG00BB
530 * --- Expanding 565 in the low word ---
532 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
533 * m = m & (01f0003f001f);
534 * m = m * (008404100840);
535 * m = m >> 8;
537 * Note the trick here - the top word is shifted by another nibble to
538 * avoid it bumping into the middle word
540 static force_inline __m64
541 expand565_16_1x64 (uint16_t pixel)
546 p = _mm_cvtsi32_si64 ((uint32_t) pixel);
548 t1 = _mm_slli_si64 (p, 36 - 11);
549 t2 = _mm_slli_si64 (p, 16 - 5);
551 p = _mm_or_si64 (t1, p);
552 p = _mm_or_si64 (t2, p);
553 p = _mm_and_si64 (p, mask_x565_rgb);
554 p = _mm_mullo_pi16 (p, mask_x565_unpack);
556 return _mm_srli_pi16 (p, 8);
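/* Worked example for the top (red) word, using the constants from the
 * comment above: a white 565 pixel leaves r = 0x1f at position 0x01f0
 * after the mask; 0x01f0 * 0x0084 == 0xffc0, and the final >> 8 yields
 * 0xff, the correctly expanded channel. */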
559 /* ----------------------------------------------------------------------------
560 * Compose Core transformations
562 static force_inline uint32_t
563 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
576 ms = unpack_32_1x64 (src);
577 return pack_1x64_32 (
578 over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
584 static force_inline uint32_t
585 combine1 (const uint32_t *ps, const uint32_t *pm)
593 mm = unpack_32_1x64 (*pm);
594 mm = expand_alpha_1x64 (mm);
596 ms = unpack_32_1x64 (s);
597 ms = pix_multiply_1x64 (ms, mm);
599 s = pack_1x64_32 (ms);
605 static force_inline __m128i
606 combine4 (const __m128i *ps, const __m128i *pm)
608 __m128i xmm_src_lo, xmm_src_hi;
609 __m128i xmm_msk_lo, xmm_msk_hi;
614 xmm_msk_lo = load_128_unaligned (pm);
616 if (is_transparent (xmm_msk_lo))
617 return _mm_setzero_si128 ();
620 s = load_128_unaligned (ps);
624 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
625 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
627 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
629 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
630 &xmm_msk_lo, &xmm_msk_hi,
631 &xmm_src_lo, &xmm_src_hi);
633 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
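/* combine1/combine4 implement the source fetch for the unified combiners:
 * when a mask is present, each source pixel is multiplied by the mask's
 * expanded alpha before the operator proper runs; without a mask the
 * source is passed through unchanged. */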
639 static force_inline void
640 core_combine_over_u_sse2 (uint32_t* pd,
647 __m128i xmm_dst_lo, xmm_dst_hi;
648 __m128i xmm_src_lo, xmm_src_hi;
649 __m128i xmm_alpha_lo, xmm_alpha_hi;
651 /* call prefetch hint to optimize cache load */
652 cache_prefetch ((__m128i*)ps);
653 cache_prefetch ((__m128i*)pd);
654 maybe_prefetch ((__m128i*)pm);
656 /* Align dst on a 16-byte boundary */
657 while (w && ((unsigned long)pd & 15))
660 s = combine1 (ps, pm);
662 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
669 /* call prefetch hint to optimize cache load */
670 cache_prefetch ((__m128i*)ps);
671 cache_prefetch ((__m128i*)pd);
672 maybe_prefetch ((__m128i*)pm);
676 /* fill cache line with next memory */
677 cache_prefetch_next ((__m128i*)ps);
678 cache_prefetch_next ((__m128i*)pd);
679 maybe_prefetch_next ((__m128i*)pm);
681 /* I'm loading unaligned because I'm not sure about
682 * the address alignment.
684 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
686 if (is_opaque (xmm_src_hi))
688 save_128_aligned ((__m128i*)pd, xmm_src_hi);
690 else if (!is_zero (xmm_src_hi))
692 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
694 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
695 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
698 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
700 over_2x128 (&xmm_src_lo, &xmm_src_hi,
701 &xmm_alpha_lo, &xmm_alpha_hi,
702 &xmm_dst_lo, &xmm_dst_hi);
704 /* rebuild the 4 pixel data and save */
705 save_128_aligned ((__m128i*)pd,
706 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
719 s = combine1 (ps, pm);
721 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
730 static force_inline void
731 core_combine_over_reverse_u_sse2 (uint32_t* pd,
738 __m128i xmm_dst_lo, xmm_dst_hi;
739 __m128i xmm_src_lo, xmm_src_hi;
740 __m128i xmm_alpha_lo, xmm_alpha_hi;
742 /* call prefetch hint to optimize cache load */
743 cache_prefetch ((__m128i*)ps);
744 cache_prefetch ((__m128i*)pd);
745 maybe_prefetch ((__m128i*)pm);
747 /* Align dst on a 16-byte boundary */
748 while (w &&
749 ((unsigned long)pd & 15))
752 s = combine1 (ps, pm);
754 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
761 /* call prefetch hint to optimize cache load */
762 cache_prefetch ((__m128i*)ps);
763 cache_prefetch ((__m128i*)pd);
764 maybe_prefetch ((__m128i*)pm);
768 /* fill cache line with next memory */
769 cache_prefetch_next ((__m128i*)ps);
770 cache_prefetch_next ((__m128i*)pd);
771 maybe_prefetch_next ((__m128i*)pm);
773 /* I'm loading unaligned because I'm not sure
774 * about the address alignment.
776 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
777 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
779 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
780 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
782 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
783 &xmm_alpha_lo, &xmm_alpha_hi);
785 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
786 &xmm_alpha_lo, &xmm_alpha_hi,
787 &xmm_src_lo, &xmm_src_hi);
789 /* rebuild the 4 pixel data and save */
790 save_128_aligned ((__m128i*)pd,
791 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
804 s = combine1 (ps, pm);
806 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
814 static force_inline uint32_t
815 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
817 uint32_t maska = src >> 24;
823 else if (maska != 0xff)
825 return pack_1x64_32 (
826 pix_multiply_1x64 (unpack_32_1x64 (dst),
827 expand_alpha_1x64 (unpack_32_1x64 (src))));
833 static force_inline void
834 core_combine_in_u_sse2 (uint32_t* pd,
841 __m128i xmm_src_lo, xmm_src_hi;
842 __m128i xmm_dst_lo, xmm_dst_hi;
844 /* call prefetch hint to optimize cache load */
845 cache_prefetch ((__m128i*)ps);
846 cache_prefetch ((__m128i*)pd);
847 maybe_prefetch ((__m128i*)pm);
849 while (w && ((unsigned long) pd & 15))
851 s = combine1 (ps, pm);
854 *pd++ = core_combine_in_u_pixelsse2 (d, s);
861 /* call prefetch hint to optimize cache load */
862 cache_prefetch ((__m128i*)ps);
863 cache_prefetch ((__m128i*)pd);
864 maybe_prefetch ((__m128i*)pm);
868 /* fill cache line with next memory */
869 cache_prefetch_next ((__m128i*)ps);
870 cache_prefetch_next ((__m128i*)pd);
871 maybe_prefetch_next ((__m128i*)pm);
873 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
874 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
876 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
877 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
879 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
880 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
881 &xmm_dst_lo, &xmm_dst_hi,
882 &xmm_dst_lo, &xmm_dst_hi);
884 save_128_aligned ((__m128i*)pd,
885 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
896 s = combine1 (ps, pm);
899 *pd++ = core_combine_in_u_pixelsse2 (d, s);
907 static force_inline void
908 core_combine_reverse_in_u_sse2 (uint32_t* pd,
915 __m128i xmm_src_lo, xmm_src_hi;
916 __m128i xmm_dst_lo, xmm_dst_hi;
918 /* call prefetch hint to optimize cache load */
919 cache_prefetch ((__m128i*)ps);
920 cache_prefetch ((__m128i*)pd);
921 maybe_prefetch ((__m128i*)pm);
923 while (w && ((unsigned long) pd & 15))
925 s = combine1 (ps, pm);
928 *pd++ = core_combine_in_u_pixelsse2 (s, d);
935 /* call prefetch hint to optimize cache load */
936 cache_prefetch ((__m128i*)ps);
937 cache_prefetch ((__m128i*)pd);
938 maybe_prefetch ((__m128i*)pm);
942 /* fill cache line with next memory */
943 cache_prefetch_next ((__m128i*)ps);
944 cache_prefetch_next ((__m128i*)pd);
945 maybe_prefetch_next ((__m128i*)pm);
947 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
948 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
950 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
951 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
953 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
954 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
955 &xmm_src_lo, &xmm_src_hi,
956 &xmm_dst_lo, &xmm_dst_hi);
959 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
970 s = combine1 (ps, pm);
973 *pd++ = core_combine_in_u_pixelsse2 (s, d);
981 static force_inline void
982 core_combine_reverse_out_u_sse2 (uint32_t* pd,
987 /* call prefetch hint to optimize cache load */
988 cache_prefetch ((__m128i*)ps);
989 cache_prefetch ((__m128i*)pd);
990 maybe_prefetch ((__m128i*)pm);
992 while (w && ((unsigned long) pd & 15))
994 uint32_t s = combine1 (ps, pm);
997 *pd++ = pack_1x64_32 (
999 unpack_32_1x64 (d), negate_1x64 (
1000 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1008 /* call prefetch hint to optimize cache load */
1009 cache_prefetch ((__m128i*)ps);
1010 cache_prefetch ((__m128i*)pd);
1011 maybe_prefetch ((__m128i*)pm);
1015 __m128i xmm_src_lo, xmm_src_hi;
1016 __m128i xmm_dst_lo, xmm_dst_hi;
1018 /* fill cache line with next memory */
1019 cache_prefetch_next ((__m128i*)ps);
1020 cache_prefetch_next ((__m128i*)pd);
1021 maybe_prefetch_next ((__m128i*)pm);
1023 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1024 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1026 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1027 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1029 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1030 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1032 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1033 &xmm_src_lo, &xmm_src_hi,
1034 &xmm_dst_lo, &xmm_dst_hi);
1037 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1049 uint32_t s = combine1 (ps, pm);
1052 *pd++ = pack_1x64_32 (
1054 unpack_32_1x64 (d), negate_1x64 (
1055 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1063 static force_inline void
1064 core_combine_out_u_sse2 (uint32_t* pd,
1069 /* call prefetch hint to optimize cache load */
1070 cache_prefetch ((__m128i*)ps);
1071 cache_prefetch ((__m128i*)pd);
1072 maybe_prefetch ((__m128i*)pm);
1074 while (w && ((unsigned long) pd & 15))
1076 uint32_t s = combine1 (ps, pm);
1079 *pd++ = pack_1x64_32 (
1081 unpack_32_1x64 (s), negate_1x64 (
1082 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1089 /* call prefetch hint to optimize cache load */
1090 cache_prefetch ((__m128i*)ps);
1091 cache_prefetch ((__m128i*)pd);
1092 maybe_prefetch ((__m128i*)pm);
1096 __m128i xmm_src_lo, xmm_src_hi;
1097 __m128i xmm_dst_lo, xmm_dst_hi;
1099 /* fill cache line with next memory */
1100 cache_prefetch_next ((__m128i*)ps);
1101 cache_prefetch_next ((__m128i*)pd);
1102 maybe_prefetch_next ((__m128i*)pm);
1104 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1105 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1107 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1108 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1110 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1111 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1113 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1114 &xmm_dst_lo, &xmm_dst_hi,
1115 &xmm_dst_lo, &xmm_dst_hi);
1118 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1129 uint32_t s = combine1 (ps, pm);
1132 *pd++ = pack_1x64_32 (
1134 unpack_32_1x64 (s), negate_1x64 (
1135 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1143 static force_inline uint32_t
1144 core_combine_atop_u_pixel_sse2 (uint32_t src,
1147 __m64 s = unpack_32_1x64 (src);
1148 __m64 d = unpack_32_1x64 (dst);
1150 __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1151 __m64 da = expand_alpha_1x64 (d);
1153 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1156 static force_inline void
1157 core_combine_atop_u_sse2 (uint32_t* pd,
1164 __m128i xmm_src_lo, xmm_src_hi;
1165 __m128i xmm_dst_lo, xmm_dst_hi;
1166 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1167 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1169 /* call prefetch hint to optimize cache load */
1170 cache_prefetch ((__m128i*)ps);
1171 cache_prefetch ((__m128i*)pd);
1172 maybe_prefetch ((__m128i*)pm);
1174 while (w && ((unsigned long) pd & 15))
1176 s = combine1 (ps, pm);
1179 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1186 /* call prefetch hint to optimize cache load */
1187 cache_prefetch ((__m128i*)ps);
1188 cache_prefetch ((__m128i*)pd);
1189 maybe_prefetch ((__m128i*)pm);
1193 /* fill cache line with next memory */
1194 cache_prefetch_next ((__m128i*)ps);
1195 cache_prefetch_next ((__m128i*)pd);
1196 maybe_prefetch_next ((__m128i*)pm);
1198 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1199 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1201 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1202 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1204 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1205 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1206 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1207 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1209 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1210 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1212 pix_add_multiply_2x128 (
1213 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1214 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1215 &xmm_dst_lo, &xmm_dst_hi);
1218 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1229 s = combine1 (ps, pm);
1232 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1240 static force_inline uint32_t
1241 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1244 __m64 s = unpack_32_1x64 (src);
1245 __m64 d = unpack_32_1x64 (dst);
1247 __m64 sa = expand_alpha_1x64 (s);
1248 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1250 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1253 static force_inline void
1254 core_combine_reverse_atop_u_sse2 (uint32_t* pd,
1261 __m128i xmm_src_lo, xmm_src_hi;
1262 __m128i xmm_dst_lo, xmm_dst_hi;
1263 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1264 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1266 /* call prefetch hint to optimize cache load */
1267 cache_prefetch ((__m128i*)ps);
1268 cache_prefetch ((__m128i*)pd);
1269 maybe_prefetch ((__m128i*)pm);
1271 while (w && ((unsigned long) pd & 15))
1273 s = combine1 (ps, pm);
1276 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1283 /* call prefetch hint to optimize cache load */
1284 cache_prefetch ((__m128i*)ps);
1285 cache_prefetch ((__m128i*)pd);
1286 maybe_prefetch ((__m128i*)pm);
1290 /* fill cache line with next memory */
1291 cache_prefetch_next ((__m128i*)ps);
1292 cache_prefetch_next ((__m128i*)pd);
1293 maybe_prefetch_next ((__m128i*)pm);
1295 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1296 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1298 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1299 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1301 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1302 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1303 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1304 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1306 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1307 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1309 pix_add_multiply_2x128 (
1310 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1311 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1312 &xmm_dst_lo, &xmm_dst_hi);
1315 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1326 s = combine1 (ps, pm);
1329 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1337 static force_inline uint32_t
1338 core_combine_xor_u_pixel_sse2 (uint32_t src,
1341 __m64 s = unpack_32_1x64 (src);
1342 __m64 d = unpack_32_1x64 (dst);
1344 __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1345 __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1347 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1350 static force_inline void
1351 core_combine_xor_u_sse2 (uint32_t* dst,
1352 const uint32_t* src,
1353 const uint32_t *mask,
1359 const uint32_t* ps = src;
1360 const uint32_t* pm = mask;
1362 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1363 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1364 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1365 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1367 /* call prefetch hint to optimize cache load */
1368 cache_prefetch ((__m128i*)ps);
1369 cache_prefetch ((__m128i*)pd);
1370 maybe_prefetch ((__m128i*)pm);
1372 while (w && ((unsigned long) pd & 15))
1374 s = combine1 (ps, pm);
1377 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1384 /* call prefetch hint to optimize cache load */
1385 cache_prefetch ((__m128i*)ps);
1386 cache_prefetch ((__m128i*)pd);
1387 maybe_prefetch ((__m128i*)pm);
1391 /* fill cache line with next memory */
1392 cache_prefetch_next ((__m128i*)ps);
1393 cache_prefetch_next ((__m128i*)pd);
1394 maybe_prefetch_next ((__m128i*)pm);
1396 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1397 xmm_dst = load_128_aligned ((__m128i*) pd);
1399 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1400 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1402 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1403 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1404 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1405 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1407 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1408 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1409 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1410 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1412 pix_add_multiply_2x128 (
1413 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1414 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1415 &xmm_dst_lo, &xmm_dst_hi);
1418 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1429 s = combine1 (ps, pm);
1432 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1440 static force_inline void
1441 core_combine_add_u_sse2 (uint32_t* dst,
1442 const uint32_t* src,
1443 const uint32_t* mask,
1449 const uint32_t* ps = src;
1450 const uint32_t* pm = mask;
1452 /* call prefetch hint to optimize cache load */
1453 cache_prefetch ((__m128i*)ps);
1454 cache_prefetch ((__m128i*)pd);
1455 maybe_prefetch ((__m128i*)pm);
1457 while (w && (unsigned long)pd & 15)
1459 s = combine1 (ps, pm);
1465 *pd++ = _mm_cvtsi64_si32 (
1466 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1470 /* call prefetch hint to optimize cache load */
1471 cache_prefetch ((__m128i*)ps);
1472 cache_prefetch ((__m128i*)pd);
1473 maybe_prefetch ((__m128i*)pm);
1479 /* fill cache line with next memory */
1480 cache_prefetch_next ((__m128i*)ps);
1481 cache_prefetch_next ((__m128i*)pd);
1482 maybe_prefetch_next ((__m128i*)pm);
1484 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1487 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1498 s = combine1 (ps, pm);
1502 *pd++ = _mm_cvtsi64_si32 (
1503 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1509 static force_inline uint32_t
1510 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1513 __m64 ms = unpack_32_1x64 (src);
1514 __m64 md = unpack_32_1x64 (dst);
1515 uint32_t sa = src >> 24;
1516 uint32_t da = ~dst >> 24;
1520 ms = pix_multiply_1x64 (
1521 ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1524 return pack_1x64_32 (_mm_adds_pu16 (md, ms));
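/* Saturating blend: sa and da above are the source alpha and the headroom
 * left in the destination; when sa exceeds that headroom the source is
 * first scaled by DIV_UN8 (da, sa), so the saturating add cannot push
 * coverage past 1.0. */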
1527 static force_inline void
1528 core_combine_saturate_u_sse2 (uint32_t * pd,
1536 __m128i xmm_src, xmm_dst;
1538 /* call prefetch hint to optimize cache load */
1539 cache_prefetch ((__m128i*)ps);
1540 cache_prefetch ((__m128i*)pd);
1541 maybe_prefetch ((__m128i*)pm);
1543 while (w && (unsigned long)pd & 15)
1545 s = combine1 (ps, pm);
1548 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1555 /* call prefetch hint to optimize cache load */
1556 cache_prefetch ((__m128i*)ps);
1557 cache_prefetch ((__m128i*)pd);
1558 maybe_prefetch ((__m128i*)pm);
1562 /* fill cache line with next memory */
1563 cache_prefetch_next ((__m128i*)ps);
1564 cache_prefetch_next ((__m128i*)pd);
1565 maybe_prefetch_next ((__m128i*)pm);
1567 xmm_dst = load_128_aligned ((__m128i*)pd);
1568 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1570 pack_cmp = _mm_movemask_epi8 (
1572 _mm_srli_epi32 (xmm_src, 24),
1573 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1575 /* if some src alpha is greater than the respective ~dst alpha */
1578 s = combine1 (ps++, pm);
1580 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1584 s = combine1 (ps++, pm);
1586 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1590 s = combine1 (ps++, pm);
1592 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1596 s = combine1 (ps++, pm);
1598 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1604 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1617 s = combine1 (ps, pm);
1620 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1627 static force_inline void
1628 core_combine_src_ca_sse2 (uint32_t* pd,
1635 __m128i xmm_src_lo, xmm_src_hi;
1636 __m128i xmm_mask_lo, xmm_mask_hi;
1637 __m128i xmm_dst_lo, xmm_dst_hi;
1639 /* call prefetch hint to optimize cache load */
1640 cache_prefetch ((__m128i*)ps);
1641 cache_prefetch ((__m128i*)pd);
1642 cache_prefetch ((__m128i*)pm);
1644 while (w && (unsigned long)pd & 15)
1648 *pd++ = pack_1x64_32 (
1649 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1653 /* call prefetch hint to optimize cache load */
1654 cache_prefetch ((__m128i*)ps);
1655 cache_prefetch ((__m128i*)pd);
1656 cache_prefetch ((__m128i*)pm);
1660 /* fill cache line with next memory */
1661 cache_prefetch_next ((__m128i*)ps);
1662 cache_prefetch_next ((__m128i*)pd);
1663 cache_prefetch_next ((__m128i*)pm);
1665 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1666 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1668 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1669 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1671 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1672 &xmm_mask_lo, &xmm_mask_hi,
1673 &xmm_dst_lo, &xmm_dst_hi);
1676 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1688 *pd++ = pack_1x64_32 (
1689 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1694 static force_inline uint32_t
1695 core_combine_over_ca_pixel_sse2 (uint32_t src,
1699 __m64 s = unpack_32_1x64 (src);
1700 __m64 expAlpha = expand_alpha_1x64 (s);
1701 __m64 unpk_mask = unpack_32_1x64 (mask);
1702 __m64 unpk_dst = unpack_32_1x64 (dst);
1704 return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1707 static force_inline void
1708 core_combine_over_ca_sse2 (uint32_t* pd,
1715 __m128i xmm_alpha_lo, xmm_alpha_hi;
1716 __m128i xmm_src_lo, xmm_src_hi;
1717 __m128i xmm_dst_lo, xmm_dst_hi;
1718 __m128i xmm_mask_lo, xmm_mask_hi;
1720 /* call prefetch hint to optimize cache load */
1721 cache_prefetch ((__m128i*)ps);
1722 cache_prefetch ((__m128i*)pd);
1723 cache_prefetch ((__m128i*)pm);
1725 while (w && (unsigned long)pd & 15)
1731 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1735 /* call prefetch hint to optimize cache load */
1736 cache_prefetch ((__m128i*)ps);
1737 cache_prefetch ((__m128i*)pd);
1738 cache_prefetch ((__m128i*)pm);
1742 /* fill cache line with next memory */
1743 cache_prefetch_next ((__m128i*)ps);
1744 cache_prefetch_next ((__m128i*)pd);
1745 cache_prefetch_next ((__m128i*)pm);
1747 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1748 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1749 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1751 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1752 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1753 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1755 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1756 &xmm_alpha_lo, &xmm_alpha_hi);
1758 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1759 &xmm_alpha_lo, &xmm_alpha_hi,
1760 &xmm_mask_lo, &xmm_mask_hi,
1761 &xmm_dst_lo, &xmm_dst_hi);
1764 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1778 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1783 static force_inline uint32_t
1784 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1788 __m64 d = unpack_32_1x64 (dst);
1790 return pack_1x64_32 (
1791 over_1x64 (d, expand_alpha_1x64 (d),
1792 pix_multiply_1x64 (unpack_32_1x64 (src),
1793 unpack_32_1x64 (mask))));
1796 static force_inline void
1797 core_combine_over_reverse_ca_sse2 (uint32_t* pd,
1804 __m128i xmm_alpha_lo, xmm_alpha_hi;
1805 __m128i xmm_src_lo, xmm_src_hi;
1806 __m128i xmm_dst_lo, xmm_dst_hi;
1807 __m128i xmm_mask_lo, xmm_mask_hi;
1809 /* call prefetch hint to optimize cache load */
1810 cache_prefetch ((__m128i*)ps);
1811 cache_prefetch ((__m128i*)pd);
1812 cache_prefetch ((__m128i*)pm);
1814 while (w && (unsigned long)pd & 15)
1820 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1824 /* call prefetch hint to optimize cache load */
1825 cache_prefetch ((__m128i*)ps);
1826 cache_prefetch ((__m128i*)pd);
1827 cache_prefetch ((__m128i*)pm);
1831 /* fill cache line with next memory */
1832 cache_prefetch_next ((__m128i*)ps);
1833 cache_prefetch_next ((__m128i*)pd);
1834 cache_prefetch_next ((__m128i*)pm);
1836 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1837 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1838 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1840 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1841 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1842 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1844 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1845 &xmm_alpha_lo, &xmm_alpha_hi);
1846 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1847 &xmm_mask_lo, &xmm_mask_hi,
1848 &xmm_mask_lo, &xmm_mask_hi);
1850 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1851 &xmm_alpha_lo, &xmm_alpha_hi,
1852 &xmm_mask_lo, &xmm_mask_hi);
1855 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1869 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1874 static force_inline void
1875 core_combine_in_ca_sse2 (uint32_t * pd,
1882 __m128i xmm_alpha_lo, xmm_alpha_hi;
1883 __m128i xmm_src_lo, xmm_src_hi;
1884 __m128i xmm_dst_lo, xmm_dst_hi;
1885 __m128i xmm_mask_lo, xmm_mask_hi;
1887 /* call prefetch hint to optimize cache load */
1888 cache_prefetch ((__m128i*)ps);
1889 cache_prefetch ((__m128i*)pd);
1890 cache_prefetch ((__m128i*)pm);
1892 while (w && (unsigned long)pd & 15)
1898 *pd++ = pack_1x64_32 (
1900 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1901 expand_alpha_1x64 (unpack_32_1x64 (d))));
1906 /* call prefetch hint to optimize cache load */
1907 cache_prefetch ((__m128i*)ps);
1908 cache_prefetch ((__m128i*)pd);
1909 cache_prefetch ((__m128i*)pm);
1913 /* fill cache line with next memory */
1914 cache_prefetch_next ((__m128i*)ps);
1915 cache_prefetch_next ((__m128i*)pd);
1916 cache_prefetch_next ((__m128i*)pm);
1918 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1919 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1920 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1922 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1923 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1924 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1926 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1927 &xmm_alpha_lo, &xmm_alpha_hi);
1929 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1930 &xmm_mask_lo, &xmm_mask_hi,
1931 &xmm_dst_lo, &xmm_dst_hi);
1933 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1934 &xmm_alpha_lo, &xmm_alpha_hi,
1935 &xmm_dst_lo, &xmm_dst_hi);
1938 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1952 *pd++ = pack_1x64_32 (
1955 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1956 expand_alpha_1x64 (unpack_32_1x64 (d))));
1962 static force_inline void
1963 core_combine_in_reverse_ca_sse2 (uint32_t * pd,
1970 __m128i xmm_alpha_lo, xmm_alpha_hi;
1971 __m128i xmm_src_lo, xmm_src_hi;
1972 __m128i xmm_dst_lo, xmm_dst_hi;
1973 __m128i xmm_mask_lo, xmm_mask_hi;
1975 /* call prefetch hint to optimize cache load */
1976 cache_prefetch ((__m128i*)ps);
1977 cache_prefetch ((__m128i*)pd);
1978 cache_prefetch ((__m128i*)pm);
1980 while (w && (unsigned long)pd & 15)
1986 *pd++ = pack_1x64_32 (
1989 pix_multiply_1x64 (unpack_32_1x64 (m),
1990 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1994 /* call prefetch hint to optimize cache load */
1995 cache_prefetch ((__m128i*)ps);
1996 cache_prefetch ((__m128i*)pd);
1997 cache_prefetch ((__m128i*)pm);
2001 /* fill cache line with next memory */
2002 cache_prefetch_next ((__m128i*)ps);
2003 cache_prefetch_next ((__m128i*)pd);
2004 cache_prefetch_next ((__m128i*)pm);
2006 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2007 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2008 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2010 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2011 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2012 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2014 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2015 &xmm_alpha_lo, &xmm_alpha_hi);
2016 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2017 &xmm_alpha_lo, &xmm_alpha_hi,
2018 &xmm_alpha_lo, &xmm_alpha_hi);
2020 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2021 &xmm_alpha_lo, &xmm_alpha_hi,
2022 &xmm_dst_lo, &xmm_dst_hi);
2025 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2039 *pd++ = pack_1x64_32 (
2042 pix_multiply_1x64 (unpack_32_1x64 (m),
2043 expand_alpha_1x64 (unpack_32_1x64 (s)))));
2048 static force_inline void
2049 core_combine_out_ca_sse2 (uint32_t * pd,
2056 __m128i xmm_alpha_lo, xmm_alpha_hi;
2057 __m128i xmm_src_lo, xmm_src_hi;
2058 __m128i xmm_dst_lo, xmm_dst_hi;
2059 __m128i xmm_mask_lo, xmm_mask_hi;
2061 /* call prefetch hint to optimize cache load */
2062 cache_prefetch ((__m128i*)ps);
2063 cache_prefetch ((__m128i*)pd);
2064 cache_prefetch ((__m128i*)pm);
2066 while (w && (unsigned long)pd & 15)
2072 *pd++ = pack_1x64_32 (
2075 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2076 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2080 /* call prefetch hint to optimize cache load */
2081 cache_prefetch ((__m128i*)ps);
2082 cache_prefetch ((__m128i*)pd);
2083 cache_prefetch ((__m128i*)pm);
2087 /* fill cache line with next memory */
2088 cache_prefetch_next ((__m128i*)ps);
2089 cache_prefetch_next ((__m128i*)pd);
2090 cache_prefetch_next ((__m128i*)pm);
2092 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2093 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2094 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2096 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2097 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2098 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2100 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2101 &xmm_alpha_lo, &xmm_alpha_hi);
2102 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2103 &xmm_alpha_lo, &xmm_alpha_hi);
2105 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2106 &xmm_mask_lo, &xmm_mask_hi,
2107 &xmm_dst_lo, &xmm_dst_hi);
2108 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2109 &xmm_alpha_lo, &xmm_alpha_hi,
2110 &xmm_dst_lo, &xmm_dst_hi);
2113 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2127 *pd++ = pack_1x64_32 (
2130 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2131 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2137 static force_inline void
2138 core_combine_out_reverse_ca_sse2 (uint32_t * pd,
2145 __m128i xmm_alpha_lo, xmm_alpha_hi;
2146 __m128i xmm_src_lo, xmm_src_hi;
2147 __m128i xmm_dst_lo, xmm_dst_hi;
2148 __m128i xmm_mask_lo, xmm_mask_hi;
2150 /* call prefetch hint to optimize cache load */
2151 cache_prefetch ((__m128i*)ps);
2152 cache_prefetch ((__m128i*)pd);
2153 cache_prefetch ((__m128i*)pm);
2155 while (w && (unsigned long)pd & 15)
2161 *pd++ = pack_1x64_32 (
2164 negate_1x64 (pix_multiply_1x64 (
2166 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2170 /* call prefetch hint to optimize cache load */
2171 cache_prefetch ((__m128i*)ps);
2172 cache_prefetch ((__m128i*)pd);
2173 cache_prefetch ((__m128i*)pm);
2177 /* fill cache line with next memory */
2178 cache_prefetch_next ((__m128i*)ps);
2179 cache_prefetch_next ((__m128i*)pd);
2180 cache_prefetch_next ((__m128i*)pm);
2182 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2183 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2184 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2186 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2187 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2188 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2190 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2191 &xmm_alpha_lo, &xmm_alpha_hi);
2193 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2194 &xmm_alpha_lo, &xmm_alpha_hi,
2195 &xmm_mask_lo, &xmm_mask_hi);
2197 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2198 &xmm_mask_lo, &xmm_mask_hi);
2200 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2201 &xmm_mask_lo, &xmm_mask_hi,
2202 &xmm_dst_lo, &xmm_dst_hi);
2205 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2219 *pd++ = pack_1x64_32 (
2222 negate_1x64 (pix_multiply_1x64 (
2224 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2229 static force_inline uint32_t
2230 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2234 __m64 m = unpack_32_1x64 (mask);
2235 __m64 s = unpack_32_1x64 (src);
2236 __m64 d = unpack_32_1x64 (dst);
2237 __m64 sa = expand_alpha_1x64 (s);
2238 __m64 da = expand_alpha_1x64 (d);
2240 s = pix_multiply_1x64 (s, m);
2241 m = negate_1x64 (pix_multiply_1x64 (m, sa));
2243 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2246 static force_inline void
2247 core_combine_atop_ca_sse2 (uint32_t * pd,
2254 __m128i xmm_src_lo, xmm_src_hi;
2255 __m128i xmm_dst_lo, xmm_dst_hi;
2256 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2257 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2258 __m128i xmm_mask_lo, xmm_mask_hi;
2260 /* call prefetch hint to optimize cache load */
2261 cache_prefetch ((__m128i*)ps);
2262 cache_prefetch ((__m128i*)pd);
2263 cache_prefetch ((__m128i*)pm);
2265 while (w && (unsigned long)pd & 15)
2271 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2275 /* call prefetch hint to optimize cache load */
2276 cache_prefetch ((__m128i*)ps);
2277 cache_prefetch ((__m128i*)pd);
2278 cache_prefetch ((__m128i*)pm);
2282 /* fill cache line with next memory */
2283 cache_prefetch_next ((__m128i*)ps);
2284 cache_prefetch_next ((__m128i*)pd);
2285 cache_prefetch_next ((__m128i*)pm);
2287 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2288 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2289 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2291 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2292 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2293 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2295 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2296 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2297 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2298 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2300 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2301 &xmm_mask_lo, &xmm_mask_hi,
2302 &xmm_src_lo, &xmm_src_hi);
2303 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2304 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2305 &xmm_mask_lo, &xmm_mask_hi);
2307 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2309 pix_add_multiply_2x128 (
2310 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2311 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2312 &xmm_dst_lo, &xmm_dst_hi);
2315 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2329 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2334 static force_inline uint32_t
2335 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2339 __m64 m = unpack_32_1x64 (mask);
2340 __m64 s = unpack_32_1x64 (src);
2341 __m64 d = unpack_32_1x64 (dst);
2343 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2344 __m64 sa = expand_alpha_1x64 (s);
2346 s = pix_multiply_1x64 (s, m);
2347 m = pix_multiply_1x64 (m, sa);
2349 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2352 static force_inline void
2353 core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
2360 __m128i xmm_src_lo, xmm_src_hi;
2361 __m128i xmm_dst_lo, xmm_dst_hi;
2362 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2363 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2364 __m128i xmm_mask_lo, xmm_mask_hi;
2366 /* call prefetch hint to optimize cache load */
2367 cache_prefetch ((__m128i*)ps);
2368 cache_prefetch ((__m128i*)pd);
2369 cache_prefetch ((__m128i*)pm);
2371 while (w && (unsigned long)pd & 15)
2377 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2381 /* call prefetch hint to optimize cache load */
2382 cache_prefetch ((__m128i*)ps);
2383 cache_prefetch ((__m128i*)pd);
2384 cache_prefetch ((__m128i*)pm);
2388 /* fill cache line with next memory */
2389 cache_prefetch_next ((__m128i*)ps);
2390 cache_prefetch_next ((__m128i*)pd);
2391 cache_prefetch_next ((__m128i*)pm);
2393 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2394 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2395 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2397 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2398 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2399 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2401 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2402 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2403 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2404 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2406 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2407 &xmm_mask_lo, &xmm_mask_hi,
2408 &xmm_src_lo, &xmm_src_hi);
2409 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2410 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2411 &xmm_mask_lo, &xmm_mask_hi);
2413 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2414 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2416 pix_add_multiply_2x128 (
2417 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2418 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2419 &xmm_dst_lo, &xmm_dst_hi);
2422 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2436 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2441 static force_inline uint32_t
2442 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2446 __m64 a = unpack_32_1x64 (mask);
2447 __m64 s = unpack_32_1x64 (src);
2448 __m64 d = unpack_32_1x64 (dst);
2450 __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2451 a, expand_alpha_1x64 (s)));
2452 __m64 dest = pix_multiply_1x64 (s, a);
2453 __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2455 return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2461 static force_inline void
2462 core_combine_xor_ca_sse2 (uint32_t * pd,
2469 __m128i xmm_src_lo, xmm_src_hi;
2470 __m128i xmm_dst_lo, xmm_dst_hi;
2471 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2472 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2473 __m128i xmm_mask_lo, xmm_mask_hi;
2475 /* call prefetch hint to optimize cache load */
2476 cache_prefetch ((__m128i*)ps);
2477 cache_prefetch ((__m128i*)pd);
2478 cache_prefetch ((__m128i*)pm);
2480 while (w && (unsigned long)pd & 15)
2486 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2490 /* call prefetch hint to optimize cache load */
2491 cache_prefetch ((__m128i*)ps);
2492 cache_prefetch ((__m128i*)pd);
2493 cache_prefetch ((__m128i*)pm);
2497 /* fill cache line with next memory */
2498 cache_prefetch_next ((__m128i*)ps);
2499 cache_prefetch_next ((__m128i*)pd);
2500 cache_prefetch_next ((__m128i*)pm);
2502 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2503 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2504 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2506 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2507 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2508 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2510 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2511 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2512 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2513 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2515 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2516 &xmm_mask_lo, &xmm_mask_hi,
2517 &xmm_src_lo, &xmm_src_hi);
2518 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2519 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2520 &xmm_mask_lo, &xmm_mask_hi);
2522 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2523 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2524 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2525 &xmm_mask_lo, &xmm_mask_hi);
2527 pix_add_multiply_2x128 (
2528 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2529 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2530 &xmm_dst_lo, &xmm_dst_hi);
2533 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2547 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2552 static force_inline void
2553 core_combine_add_ca_sse2 (uint32_t * pd,
2560 __m128i xmm_src_lo, xmm_src_hi;
2561 __m128i xmm_dst_lo, xmm_dst_hi;
2562 __m128i xmm_mask_lo, xmm_mask_hi;
2564 /* call prefetch hint to optimize cache load */
2565 cache_prefetch ((__m128i*)ps);
2566 cache_prefetch ((__m128i*)pd);
2567 cache_prefetch ((__m128i*)pm);
2569 while (w && (unsigned long)pd & 15)
2575 *pd++ = pack_1x64_32 (
2576 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2577 unpack_32_1x64 (m)),
2578 unpack_32_1x64 (d)));
2582 /* call prefetch hint to optimize cache load */
2583 cache_prefetch ((__m128i*)ps);
2584 cache_prefetch ((__m128i*)pd);
2585 cache_prefetch ((__m128i*)pm);
2589 /* fill cache line with next memory */
2590 cache_prefetch_next ((__m128i*)ps);
2591 cache_prefetch_next ((__m128i*)pd);
2592 cache_prefetch_next ((__m128i*)pm);
2594 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2595 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2596 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2598 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2599 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2600 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2602 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2603 &xmm_mask_lo, &xmm_mask_hi,
2604 &xmm_src_lo, &xmm_src_hi);
2607 (__m128i*)pd, pack_2x128_128 (
2608 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2609 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2623 *pd++ = pack_1x64_32 (
2624 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2625 unpack_32_1x64 (m)),
2626 unpack_32_1x64 (d)));
2631 /* ---------------------------------------------------
2632 * fb_compose_setup_SSE2
2634 static force_inline __m64
2635 create_mask_16_64 (uint16_t mask)
2637 return _mm_set1_pi16 (mask);
2640 static force_inline __m128i
2641 create_mask_16_128 (uint16_t mask)
2643 return _mm_set1_epi16 (mask);
2646 static force_inline __m64
2647 create_mask_2x32_64 (uint32_t mask0,
2650 return _mm_set_pi32 (mask0, mask1);
2653 /* Work around a code generation bug in Sun Studio 12. */
2654 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2655 # define create_mask_2x32_128(mask0, mask1) \
2656 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2658 static force_inline __m128i
2659 create_mask_2x32_128 (uint32_t mask0,
2662 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2666 /* SSE2 code patch for fbcompose.c */
2669 sse2_combine_over_u (pixman_implementation_t *imp,
2672 const uint32_t * src,
2673 const uint32_t * mask,
2676 core_combine_over_u_sse2 (dst, src, mask, width);
2681 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2684 const uint32_t * src,
2685 const uint32_t * mask,
2688 core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2693 sse2_combine_in_u (pixman_implementation_t *imp,
2696 const uint32_t * src,
2697 const uint32_t * mask,
2700 core_combine_in_u_sse2 (dst, src, mask, width);
2705 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2708 const uint32_t * src,
2709 const uint32_t * mask,
2712 core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2717 sse2_combine_out_u (pixman_implementation_t *imp,
2720 const uint32_t * src,
2721 const uint32_t * mask,
2724 core_combine_out_u_sse2 (dst, src, mask, width);
2729 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2732 const uint32_t * src,
2733 const uint32_t * mask,
2736 core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2741 sse2_combine_atop_u (pixman_implementation_t *imp,
2744 const uint32_t * src,
2745 const uint32_t * mask,
2748 core_combine_atop_u_sse2 (dst, src, mask, width);
2753 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2756 const uint32_t * src,
2757 const uint32_t * mask,
2760 core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2765 sse2_combine_xor_u (pixman_implementation_t *imp,
2768 const uint32_t * src,
2769 const uint32_t * mask,
2772 core_combine_xor_u_sse2 (dst, src, mask, width);
2777 sse2_combine_add_u (pixman_implementation_t *imp,
2780 const uint32_t * src,
2781 const uint32_t * mask,
2784 core_combine_add_u_sse2 (dst, src, mask, width);
2789 sse2_combine_saturate_u (pixman_implementation_t *imp,
2792 const uint32_t * src,
2793 const uint32_t * mask,
2796 core_combine_saturate_u_sse2 (dst, src, mask, width);
2801 sse2_combine_src_ca (pixman_implementation_t *imp,
2804 const uint32_t * src,
2805 const uint32_t * mask,
2808 core_combine_src_ca_sse2 (dst, src, mask, width);
2813 sse2_combine_over_ca (pixman_implementation_t *imp,
2816 const uint32_t * src,
2817 const uint32_t * mask,
2820 core_combine_over_ca_sse2 (dst, src, mask, width);
2825 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2828 const uint32_t * src,
2829 const uint32_t * mask,
2832 core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2837 sse2_combine_in_ca (pixman_implementation_t *imp,
2840 const uint32_t * src,
2841 const uint32_t * mask,
2844 core_combine_in_ca_sse2 (dst, src, mask, width);
2849 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2852 const uint32_t * src,
2853 const uint32_t * mask,
2856 core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2861 sse2_combine_out_ca (pixman_implementation_t *imp,
2864 const uint32_t * src,
2865 const uint32_t * mask,
2868 core_combine_out_ca_sse2 (dst, src, mask, width);
2873 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2876 const uint32_t * src,
2877 const uint32_t * mask,
2880 core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2885 sse2_combine_atop_ca (pixman_implementation_t *imp,
2888 const uint32_t * src,
2889 const uint32_t * mask,
2892 core_combine_atop_ca_sse2 (dst, src, mask, width);
2897 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2900 const uint32_t * src,
2901 const uint32_t * mask,
2904 core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2909 sse2_combine_xor_ca (pixman_implementation_t *imp,
2912 const uint32_t * src,
2913 const uint32_t * mask,
2916 core_combine_xor_ca_sse2 (dst, src, mask, width);
2921 sse2_combine_add_ca (pixman_implementation_t *imp,
2924 const uint32_t * src,
2925 const uint32_t * mask,
2928 core_combine_add_ca_sse2 (dst, src, mask, width);
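
/* A sketch of how the wrappers above get wired up. Assumption: the SSE2
 * implementation constructor fills per-operator combiner tables the way
 * other pixman backends do; the field names below are illustrative.
 */
#if 0
static void
sse2_setup_combiners (pixman_implementation_t *imp)
{
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
}
#endif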
2932 /* -------------------------------------------------------------------
 * composite_over_n_8888
 */
2937 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2939 pixman_image_t * src_image,
2940 pixman_image_t * mask_image,
2941 pixman_image_t * dst_image,
2952 uint32_t *dst_line, *dst, d;
2955 __m128i xmm_src, xmm_alpha;
2956 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2958 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2963 PIXMAN_IMAGE_GET_LINE (
2964 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2966 xmm_src = expand_pixel_32_1x128 (src);
2967 xmm_alpha = expand_alpha_1x128 (xmm_src);
2973 /* call prefetch hint to optimize cache load*/
2974 cache_prefetch ((__m128i*)dst);
2976 dst_line += dst_stride;
2979 while (w && (unsigned long)dst & 15)
2982 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2983 _mm_movepi64_pi64 (xmm_alpha),
2984 unpack_32_1x64 (d)));
2988 cache_prefetch ((__m128i*)dst);
2992 /* fill cache line with next memory */
2993 cache_prefetch_next ((__m128i*)dst);
2995 xmm_dst = load_128_aligned ((__m128i*)dst);
2997 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2999 over_2x128 (&xmm_src, &xmm_src,
3000 &xmm_alpha, &xmm_alpha,
3001 &xmm_dst_lo, &xmm_dst_hi);
	/* rebuild the 4-pixel data and save */
	save_128_aligned (
	    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3014 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3015 _mm_movepi64_pi64 (xmm_alpha),
3016 unpack_32_1x64 (d)));
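
/* Scalar reference for what over_1x64 / over_2x128 compute per channel
 * (premultiplied alpha, 0..255 range). A sketch, assuming the usual
 * pixman rounding via a 0x80 bias with >> 8 folding:
 */
#if 0
/* rounded a * b / 255 */
static uint8_t
mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t)a * b + 0x80;

    return (t + (t >> 8)) >> 8;
}

/* OVER: src + dest * (255 - srca) / 255; cannot overflow for a
 * premultiplied source, since src <= srca */
static uint8_t
over_un8 (uint8_t s, uint8_t sa, uint8_t d)
{
    return s + mul_un8 (d, 255 - sa);
}
#endif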
3024 /* ---------------------------------------------------------------------
 * composite_over_n_0565
 */
3028 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3030 pixman_image_t * src_image,
3031 pixman_image_t * mask_image,
3032 pixman_image_t * dst_image,
3043 uint16_t *dst_line, *dst, d;
3046 __m128i xmm_src, xmm_alpha;
3047 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3049 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3054 PIXMAN_IMAGE_GET_LINE (
3055 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3057 xmm_src = expand_pixel_32_1x128 (src);
3058 xmm_alpha = expand_alpha_1x128 (xmm_src);
3064 /* call prefetch hint to optimize cache load*/
3065 cache_prefetch ((__m128i*)dst);
3067 dst_line += dst_stride;
3070 while (w && (unsigned long)dst & 15)
3074 *dst++ = pack_565_32_16 (
3075 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3076 _mm_movepi64_pi64 (xmm_alpha),
3077 expand565_16_1x64 (d))));
3081 /* call prefetch hint to optimize cache load*/
3082 cache_prefetch ((__m128i*)dst);
3086 /* fill cache line with next memory */
3087 cache_prefetch_next ((__m128i*)dst);
3089 xmm_dst = load_128_aligned ((__m128i*)dst);
3091 unpack_565_128_4x128 (xmm_dst,
3092 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3094 over_2x128 (&xmm_src, &xmm_src,
3095 &xmm_alpha, &xmm_alpha,
3096 &xmm_dst0, &xmm_dst1);
3097 over_2x128 (&xmm_src, &xmm_src,
3098 &xmm_alpha, &xmm_alpha,
3099 &xmm_dst2, &xmm_dst3);
3101 xmm_dst = pack_565_4x128_128 (
3102 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3104 save_128_aligned ((__m128i*)dst, xmm_dst);
3113 *dst++ = pack_565_32_16 (
3114 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3115 _mm_movepi64_pi64 (xmm_alpha),
3116 expand565_16_1x64 (d))));
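
/* For reference, the 8888 <-> 0565 conversions used above boil down to
 * (scalar sketch; the pack truncates, the unpack replicates high bits):
 *
 *     pack:   ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3)
 *     unpack: r = (r5 << 3) | (r5 >> 2),  g = (g6 << 2) | (g6 >> 4)
 */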
3123 /* ------------------------------
 * composite_add_n_8888_8888_ca
 */
3127 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3129 pixman_image_t * src_image,
3130 pixman_image_t * mask_image,
3131 pixman_image_t * dst_image,
3142 uint32_t *dst_line, d;
3143 uint32_t *mask_line, m;
3145 int dst_stride, mask_stride;
3147 __m128i xmm_src, xmm_alpha;
3149 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3151 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3153 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3159 PIXMAN_IMAGE_GET_LINE (
3160 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3161 PIXMAN_IMAGE_GET_LINE (
3162 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3164 xmm_src = _mm_unpacklo_epi8 (
3165 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3166 xmm_alpha = expand_alpha_1x128 (xmm_src);
3167 mmx_src = _mm_movepi64_pi64 (xmm_src);
3168 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3173 const uint32_t *pm = (uint32_t *)mask_line;
3174 uint32_t *pd = (uint32_t *)dst_line;
3176 dst_line += dst_stride;
3177 mask_line += mask_stride;
3179 /* call prefetch hint to optimize cache load*/
3180 cache_prefetch ((__m128i*)pd);
3181 cache_prefetch ((__m128i*)pm);
3183 while (w && (unsigned long)pd & 15)
3191 mmx_mask = unpack_32_1x64 (m);
3192 mmx_dest = unpack_32_1x64 (d);
3194 *pd = pack_1x64_32 (
3195 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3202 /* call prefetch hint to optimize cache load*/
3203 cache_prefetch ((__m128i*)pd);
3204 cache_prefetch ((__m128i*)pm);
3208 /* fill cache line with next memory */
3209 cache_prefetch_next ((__m128i*)pd);
3210 cache_prefetch_next ((__m128i*)pm);
3212 xmm_mask = load_128_unaligned ((__m128i*)pm);
	    pack_cmp = _mm_movemask_epi8 (
		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
	    /* if all bits of the mask are zero, pack_cmp is 0xffff */
3219 if (pack_cmp != 0xffff)
3221 xmm_dst = load_128_aligned ((__m128i*)pd);
3223 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3225 pix_multiply_2x128 (&xmm_src, &xmm_src,
3226 &xmm_mask_lo, &xmm_mask_hi,
3227 &xmm_mask_lo, &xmm_mask_hi);
3228 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
		save_128_aligned (
		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3247 mmx_mask = unpack_32_1x64 (m);
3248 mmx_dest = unpack_32_1x64 (d);
3250 *pd = pack_1x64_32 (
3251 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3262 /* ---------------------------------------------------------------------------
 * composite_over_n_8888_8888_ca
 */
3267 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3269 pixman_image_t * src_image,
3270 pixman_image_t * mask_image,
3271 pixman_image_t * dst_image,
3282 uint32_t *dst_line, d;
3283 uint32_t *mask_line, m;
3285 int dst_stride, mask_stride;
3287 __m128i xmm_src, xmm_alpha;
3288 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3289 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3291 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3293 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3298 PIXMAN_IMAGE_GET_LINE (
3299 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3300 PIXMAN_IMAGE_GET_LINE (
3301 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3303 xmm_src = _mm_unpacklo_epi8 (
3304 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3305 xmm_alpha = expand_alpha_1x128 (xmm_src);
3306 mmx_src = _mm_movepi64_pi64 (xmm_src);
3307 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3312 const uint32_t *pm = (uint32_t *)mask_line;
3313 uint32_t *pd = (uint32_t *)dst_line;
3315 dst_line += dst_stride;
3316 mask_line += mask_stride;
3318 /* call prefetch hint to optimize cache load*/
3319 cache_prefetch ((__m128i*)pd);
3320 cache_prefetch ((__m128i*)pm);
3322 while (w && (unsigned long)pd & 15)
3329 mmx_mask = unpack_32_1x64 (m);
3330 mmx_dest = unpack_32_1x64 (d);
3332 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3342 /* call prefetch hint to optimize cache load*/
3343 cache_prefetch ((__m128i*)pd);
3344 cache_prefetch ((__m128i*)pm);
3348 /* fill cache line with next memory */
3349 cache_prefetch_next ((__m128i*)pd);
3350 cache_prefetch_next ((__m128i*)pm);
3352 xmm_mask = load_128_unaligned ((__m128i*)pm);
	    pack_cmp = _mm_movemask_epi8 (
		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
	    /* if all bits of the mask are zero, pack_cmp is 0xffff */
3359 if (pack_cmp != 0xffff)
3361 xmm_dst = load_128_aligned ((__m128i*)pd);
3363 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3364 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3366 in_over_2x128 (&xmm_src, &xmm_src,
3367 &xmm_alpha, &xmm_alpha,
3368 &xmm_mask_lo, &xmm_mask_hi,
3369 &xmm_dst_lo, &xmm_dst_hi);
		save_128_aligned (
		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3387 mmx_mask = unpack_32_1x64 (m);
3388 mmx_dest = unpack_32_1x64 (d);
3390 *pd = pack_1x64_32 (
3391 in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
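
/* Per channel, in_over is IN followed by OVER; a scalar sketch, reusing
 * mul_un8/over_un8 from the earlier sketch:
 */
#if 0
static uint8_t
in_over_un8 (uint8_t s, uint8_t sa, uint8_t m, uint8_t d)
{
    /* (src IN mask) OVER dest */
    return over_un8 (mul_un8 (s, m), mul_un8 (sa, m), d);
}
#endif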
3402 /*---------------------------------------------------------------------
 * composite_over_8888_n_8888
 */
3407 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3409 pixman_image_t * src_image,
3410 pixman_image_t * mask_image,
3411 pixman_image_t * dst_image,
3421 uint32_t *dst_line, *dst;
3422 uint32_t *src_line, *src;
3425 int dst_stride, src_stride;
3428 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3429 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3430 __m128i xmm_alpha_lo, xmm_alpha_hi;
3432 PIXMAN_IMAGE_GET_LINE (
3433 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3434 PIXMAN_IMAGE_GET_LINE (
3435 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3437 mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3439 xmm_mask = create_mask_16_128 (mask >> 24);
3444 dst_line += dst_stride;
3446 src_line += src_stride;
3449 /* call prefetch hint to optimize cache load*/
3450 cache_prefetch ((__m128i*)dst);
3451 cache_prefetch ((__m128i*)src);
3453 while (w && (unsigned long)dst & 15)
3455 uint32_t s = *src++;
3458 __m64 ms = unpack_32_1x64 (s);
3459 __m64 alpha = expand_alpha_1x64 (ms);
	    __m64 mask = _mm_movepi64_pi64 (xmm_mask);
	    __m64 dest = unpack_32_1x64 (d);

	    *dst++ = pack_1x64_32 (
		in_over_1x64 (&ms, &alpha, &mask, &dest));
3469 /* call prefetch hint to optimize cache load*/
3470 cache_prefetch ((__m128i*)dst);
3471 cache_prefetch ((__m128i*)src);
3475 /* fill cache line with next memory */
3476 cache_prefetch_next ((__m128i*)dst);
3477 cache_prefetch_next ((__m128i*)src);
3479 xmm_src = load_128_unaligned ((__m128i*)src);
3480 xmm_dst = load_128_aligned ((__m128i*)dst);
3482 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3483 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3484 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3485 &xmm_alpha_lo, &xmm_alpha_hi);
3487 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3488 &xmm_alpha_lo, &xmm_alpha_hi,
3489 &xmm_mask, &xmm_mask,
3490 &xmm_dst_lo, &xmm_dst_hi);
	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3502 uint32_t s = *src++;
3505 __m64 ms = unpack_32_1x64 (s);
3506 __m64 alpha = expand_alpha_1x64 (ms);
3507 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3508 __m64 dest = unpack_32_1x64 (d);
3510 *dst++ = pack_1x64_32 (
3511 in_over_1x64 (&ms, &alpha, &mask, &dest));
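
/* Descriptive note: create_mask_16_128 (mask >> 24) above replicates the
 * solid mask's 8-bit alpha into every 16-bit lane, so in_over sees the
 * same alpha for each channel of all four pixels; e.g. a solid mask of
 * 0x80ffffff yields eight lanes of 0x0080.
 */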
3520 /* ---------------------------------------------------------------------
 * composite_over_x888_n_8888
 */
3524 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3526 pixman_image_t * src_image,
3527 pixman_image_t * mask_image,
3528 pixman_image_t * dst_image,
3538 uint32_t *dst_line, *dst;
3539 uint32_t *src_line, *src;
3541 int dst_stride, src_stride;
3544 __m128i xmm_mask, xmm_alpha;
3545 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3546 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3548 PIXMAN_IMAGE_GET_LINE (
3549 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3550 PIXMAN_IMAGE_GET_LINE (
3551 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3553 mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3555 xmm_mask = create_mask_16_128 (mask >> 24);
3556 xmm_alpha = mask_00ff;
3561 dst_line += dst_stride;
3563 src_line += src_stride;
3566 /* call prefetch hint to optimize cache load*/
3567 cache_prefetch ((__m128i*)dst);
3568 cache_prefetch ((__m128i*)src);
3570 while (w && (unsigned long)dst & 15)
	uint32_t s = (*src++) | 0xff000000;	/* x888 source: force opaque alpha */
3575 __m64 src = unpack_32_1x64 (s);
3576 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3577 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3578 __m64 dest = unpack_32_1x64 (d);
3580 *dst++ = pack_1x64_32 (
3581 in_over_1x64 (&src, &alpha, &mask, &dest));
3586 /* call prefetch hint to optimize cache load*/
3587 cache_prefetch ((__m128i*)dst);
3588 cache_prefetch ((__m128i*)src);
3592 /* fill cache line with next memory */
3593 cache_prefetch_next ((__m128i*)dst);
3594 cache_prefetch_next ((__m128i*)src);
3596 xmm_src = _mm_or_si128 (
3597 load_128_unaligned ((__m128i*)src), mask_ff000000);
3598 xmm_dst = load_128_aligned ((__m128i*)dst);
3600 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3601 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3603 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3604 &xmm_alpha, &xmm_alpha,
3605 &xmm_mask, &xmm_mask,
3606 &xmm_dst_lo, &xmm_dst_hi);
	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3619 uint32_t s = (*src++) | 0xff000000;
3622 __m64 src = unpack_32_1x64 (s);
3623 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3624 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3625 __m64 dest = unpack_32_1x64 (d);
3627 *dst++ = pack_1x64_32 (
3628 in_over_1x64 (&src, &alpha, &mask, &dest));
3637 /* --------------------------------------------------------------------
 * composite_over_8888_8888
 */
3641 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3643 pixman_image_t * src_image,
3644 pixman_image_t * mask_image,
3645 pixman_image_t * dst_image,
3655 int dst_stride, src_stride;
3656 uint32_t *dst_line, *dst;
3657 uint32_t *src_line, *src;
3659 PIXMAN_IMAGE_GET_LINE (
3660 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3661 PIXMAN_IMAGE_GET_LINE (
3662 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3669 core_combine_over_u_sse2 (dst, src, NULL, width);
3677 /* ------------------------------------------------------------------
 * composite_over_8888_0565
 */
3680 static force_inline uint16_t
3681 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3685 ms = unpack_32_1x64 (src);
3686 return pack_565_32_16 (
3689 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3693 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3695 pixman_image_t * src_image,
3696 pixman_image_t * mask_image,
3697 pixman_image_t * dst_image,
3707 uint16_t *dst_line, *dst, d;
3708 uint32_t *src_line, *src, s;
3709 int dst_stride, src_stride;
3712 __m128i xmm_alpha_lo, xmm_alpha_hi;
3713 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3714 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3716 PIXMAN_IMAGE_GET_LINE (
3717 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3718 PIXMAN_IMAGE_GET_LINE (
3719 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 * I copied the code from the MMX version and kept the FIXME.
 * If it's a problem there, it's probably a problem here.
3727 assert (src_image->drawable == mask_image->drawable);
3735 /* call prefetch hint to optimize cache load*/
3736 cache_prefetch ((__m128i*)src);
3737 cache_prefetch ((__m128i*)dst);
3739 dst_line += dst_stride;
3740 src_line += src_stride;
3743 /* Align dst on a 16-byte boundary */
3745 ((unsigned long)dst & 15))
3750 *dst++ = composite_over_8888_0565pixel (s, d);
3754 /* call prefetch hint to optimize cache load*/
3755 cache_prefetch ((__m128i*)src);
3756 cache_prefetch ((__m128i*)dst);
    /* This is an 8-pixel loop */
3761 /* fill cache line with next memory */
3762 cache_prefetch_next ((__m128i*)src);
3763 cache_prefetch_next ((__m128i*)dst);
	/* I'm loading unaligned because I'm not sure
	 * about the address alignment.
	 */
3768 xmm_src = load_128_unaligned ((__m128i*) src);
3769 xmm_dst = load_128_aligned ((__m128i*) dst);
3772 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3773 unpack_565_128_4x128 (xmm_dst,
3774 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3775 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3776 &xmm_alpha_lo, &xmm_alpha_hi);
	/* Load the next 4 pixels from memory early
	 * to optimize the memory read.
	 */
3781 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3783 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3784 &xmm_alpha_lo, &xmm_alpha_hi,
3785 &xmm_dst0, &xmm_dst1);
3788 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3789 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3790 &xmm_alpha_lo, &xmm_alpha_hi);
3792 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3793 &xmm_alpha_lo, &xmm_alpha_hi,
3794 &xmm_dst2, &xmm_dst3);
	save_128_aligned (
	    (__m128i*)dst, pack_565_4x128_128 (
		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3810 *dst++ = composite_over_8888_0565pixel (s, d);
3817 /* -----------------------------------------------------------------
 * composite_over_n_8_8888
 */
3822 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3824 pixman_image_t * src_image,
3825 pixman_image_t * mask_image,
3826 pixman_image_t * dst_image,
3837 uint32_t *dst_line, *dst;
3838 uint8_t *mask_line, *mask;
3839 int dst_stride, mask_stride;
3843 __m128i xmm_src, xmm_alpha, xmm_def;
3844 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3845 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3847 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3849 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3855 PIXMAN_IMAGE_GET_LINE (
3856 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3857 PIXMAN_IMAGE_GET_LINE (
3858 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3860 xmm_def = create_mask_2x32_128 (src, src);
3861 xmm_src = expand_pixel_32_1x128 (src);
3862 xmm_alpha = expand_alpha_1x128 (xmm_src);
3863 mmx_src = _mm_movepi64_pi64 (xmm_src);
3864 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3869 dst_line += dst_stride;
3871 mask_line += mask_stride;
3874 /* call prefetch hint to optimize cache load*/
3875 cache_prefetch ((__m128i*)mask);
3876 cache_prefetch ((__m128i*)dst);
3878 while (w && (unsigned long)dst & 15)
3880 uint8_t m = *mask++;
3885 mmx_mask = expand_pixel_8_1x64 (m);
3886 mmx_dest = unpack_32_1x64 (d);
3888 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3898 /* call prefetch hint to optimize cache load*/
3899 cache_prefetch ((__m128i*)mask);
3900 cache_prefetch ((__m128i*)dst);
3904 /* fill cache line with next memory */
3905 cache_prefetch_next ((__m128i*)mask);
3906 cache_prefetch_next ((__m128i*)dst);
3908 m = *((uint32_t*)mask);
3910 if (srca == 0xff && m == 0xffffffff)
3912 save_128_aligned ((__m128i*)dst, xmm_def);
3916 xmm_dst = load_128_aligned ((__m128i*) dst);
3917 xmm_mask = unpack_32_1x128 (m);
3918 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3921 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3922 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3924 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3925 &xmm_mask_lo, &xmm_mask_hi);
3927 in_over_2x128 (&xmm_src, &xmm_src,
3928 &xmm_alpha, &xmm_alpha,
3929 &xmm_mask_lo, &xmm_mask_hi,
3930 &xmm_dst_lo, &xmm_dst_hi);
	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3943 uint8_t m = *mask++;
3948 mmx_mask = expand_pixel_8_1x64 (m);
3949 mmx_dest = unpack_32_1x64 (d);
3951 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3965 /* ----------------------------------------------------------------
 * pixman_fill_sse2
 */
3970 pixman_fill_sse2 (uint32_t *bits,
3979 uint32_t byte_width;
3989 stride = stride * (int) sizeof (uint32_t) / 1;
3990 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	data = (w << 16) | w;	/* replicate the fill byte into all four bytes */
4000 stride = stride * (int) sizeof (uint32_t) / 2;
4001 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
4002 byte_width = 2 * width;
	data = (data & 0xffff) * 0x00010001;	/* replicate the 16-bit value into both halves */
4009 stride = stride * (int) sizeof (uint32_t) / 4;
4010 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
4011 byte_width = 4 * width;
4019 cache_prefetch ((__m128i*)byte_line);
4020 xmm_def = create_mask_2x32_128 (data, data);
4025 uint8_t *d = byte_line;
4026 byte_line += stride;
4029 cache_prefetch_next ((__m128i*)d);
4031 while (w >= 1 && ((unsigned long)d & 1))
4033 *(uint8_t *)d = data;
4038 while (w >= 2 && ((unsigned long)d & 3))
4040 *(uint16_t *)d = data;
4045 while (w >= 4 && ((unsigned long)d & 15))
4047 *(uint32_t *)d = data;
4053 cache_prefetch_next ((__m128i*)d);
4057 cache_prefetch (((__m128i*)d) + 12);
4059 save_128_aligned ((__m128i*)(d), xmm_def);
4060 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4061 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4062 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4063 save_128_aligned ((__m128i*)(d + 64), xmm_def);
4064 save_128_aligned ((__m128i*)(d + 80), xmm_def);
4065 save_128_aligned ((__m128i*)(d + 96), xmm_def);
4066 save_128_aligned ((__m128i*)(d + 112), xmm_def);
4074 cache_prefetch (((__m128i*)d) + 8);
4076 save_128_aligned ((__m128i*)(d), xmm_def);
4077 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4078 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4079 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4085 cache_prefetch_next ((__m128i*)d);
4089 save_128_aligned ((__m128i*)(d), xmm_def);
4090 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4098 save_128_aligned ((__m128i*)(d), xmm_def);
4104 cache_prefetch_next ((__m128i*)d);
4108 *(uint32_t *)d = data;
4116 *(uint16_t *)d = data;
4123 *(uint8_t *)d = data;
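
/* Usage sketch for pixman_fill_sse2 (illustrative values; the rowstride
 * is in uint32_t units, matching the call from
 * sse2_composite_src_n_8_8888 below):
 */
#if 0
{
    uint32_t scratch[64 * 32];

    /* fill a 64x32 region of a 32 bpp buffer with opaque red */
    pixman_fill_sse2 (scratch, 64, 32, 0, 0, 64, 32, 0xffff0000);
}
#endif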
4134 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4136 pixman_image_t * src_image,
4137 pixman_image_t * mask_image,
4138 pixman_image_t * dst_image,
4149 uint32_t *dst_line, *dst;
4150 uint8_t *mask_line, *mask;
4151 int dst_stride, mask_stride;
4155 __m128i xmm_src, xmm_def;
4156 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4158 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4163 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4164 PIXMAN_FORMAT_BPP (dst_image->bits.format),
4165 dest_x, dest_y, width, height, 0);
4169 PIXMAN_IMAGE_GET_LINE (
4170 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4171 PIXMAN_IMAGE_GET_LINE (
4172 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4174 xmm_def = create_mask_2x32_128 (src, src);
4175 xmm_src = expand_pixel_32_1x128 (src);
4180 dst_line += dst_stride;
4182 mask_line += mask_stride;
4185 /* call prefetch hint to optimize cache load*/
4186 cache_prefetch ((__m128i*)mask);
4187 cache_prefetch ((__m128i*)dst);
4189 while (w && (unsigned long)dst & 15)
4191 uint8_t m = *mask++;
4195 *dst = pack_1x64_32 (
4197 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4208 /* call prefetch hint to optimize cache load*/
4209 cache_prefetch ((__m128i*)mask);
4210 cache_prefetch ((__m128i*)dst);
4214 /* fill cache line with next memory */
4215 cache_prefetch_next ((__m128i*)mask);
4216 cache_prefetch_next ((__m128i*)dst);
4218 m = *((uint32_t*)mask);
4220 if (srca == 0xff && m == 0xffffffff)
4222 save_128_aligned ((__m128i*)dst, xmm_def);
4226 xmm_mask = unpack_32_1x128 (m);
4227 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4230 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4232 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4233 &xmm_mask_lo, &xmm_mask_hi);
4235 pix_multiply_2x128 (&xmm_src, &xmm_src,
4236 &xmm_mask_lo, &xmm_mask_hi,
4237 &xmm_mask_lo, &xmm_mask_hi);
		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4244 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4254 uint8_t m = *mask++;
4258 *dst = pack_1x64_32 (
4260 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4275 /*-----------------------------------------------------------------------
 * composite_over_n_8_0565
 */
4280 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4282 pixman_image_t * src_image,
4283 pixman_image_t * mask_image,
4284 pixman_image_t * dst_image,
4295 uint16_t *dst_line, *dst, d;
4296 uint8_t *mask_line, *mask;
4297 int dst_stride, mask_stride;
4300 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4302 __m128i xmm_src, xmm_alpha;
4303 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4304 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4306 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4312 PIXMAN_IMAGE_GET_LINE (
4313 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4314 PIXMAN_IMAGE_GET_LINE (
4315 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4317 xmm_src = expand_pixel_32_1x128 (src);
4318 xmm_alpha = expand_alpha_1x128 (xmm_src);
4319 mmx_src = _mm_movepi64_pi64 (xmm_src);
4320 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4325 dst_line += dst_stride;
4327 mask_line += mask_stride;
4330 /* call prefetch hint to optimize cache load*/
4331 cache_prefetch ((__m128i*)mask);
4332 cache_prefetch ((__m128i*)dst);
4334 while (w && (unsigned long)dst & 15)
4341 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4342 mmx_dest = expand565_16_1x64 (d);
4344 *dst = pack_565_32_16 (
4347 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4354 /* call prefetch hint to optimize cache load*/
4355 cache_prefetch ((__m128i*)mask);
4356 cache_prefetch ((__m128i*)dst);
4360 /* fill cache line with next memory */
4361 cache_prefetch_next ((__m128i*)mask);
4362 cache_prefetch_next ((__m128i*)dst);
4364 xmm_dst = load_128_aligned ((__m128i*) dst);
4365 unpack_565_128_4x128 (xmm_dst,
4366 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4368 m = *((uint32_t*)mask);
4373 xmm_mask = unpack_32_1x128 (m);
4374 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4377 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4379 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4380 &xmm_mask_lo, &xmm_mask_hi);
4382 in_over_2x128 (&xmm_src, &xmm_src,
4383 &xmm_alpha, &xmm_alpha,
4384 &xmm_mask_lo, &xmm_mask_hi,
4385 &xmm_dst0, &xmm_dst1);
4388 m = *((uint32_t*)mask);
4393 xmm_mask = unpack_32_1x128 (m);
4394 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4397 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4399 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4400 &xmm_mask_lo, &xmm_mask_hi);
4401 in_over_2x128 (&xmm_src, &xmm_src,
4402 &xmm_alpha, &xmm_alpha,
4403 &xmm_mask_lo, &xmm_mask_hi,
4404 &xmm_dst2, &xmm_dst3);
	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4422 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4423 mmx_dest = expand565_16_1x64 (d);
4425 *dst = pack_565_32_16 (
4428 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4439 /* -----------------------------------------------------------------------
 * composite_over_pixbuf_0565
 */
4444 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4446 pixman_image_t * src_image,
4447 pixman_image_t * mask_image,
4448 pixman_image_t * dst_image,
4458 uint16_t *dst_line, *dst, d;
4459 uint32_t *src_line, *src, s;
4460 int dst_stride, src_stride;
4462 uint32_t opaque, zero;
4465 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4466 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4468 PIXMAN_IMAGE_GET_LINE (
4469 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4470 PIXMAN_IMAGE_GET_LINE (
4471 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 * I copied the code from the MMX version and kept the FIXME.
 * If it's a problem there, it's probably a problem here.
4479 assert (src_image->drawable == mask_image->drawable);
4485 dst_line += dst_stride;
4487 src_line += src_stride;
4490 /* call prefetch hint to optimize cache load*/
4491 cache_prefetch ((__m128i*)src);
4492 cache_prefetch ((__m128i*)dst);
4494 while (w && (unsigned long)dst & 15)
4499 ms = unpack_32_1x64 (s);
4501 *dst++ = pack_565_32_16 (
4503 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4507 /* call prefetch hint to optimize cache load*/
4508 cache_prefetch ((__m128i*)src);
4509 cache_prefetch ((__m128i*)dst);
4513 /* fill cache line with next memory */
4514 cache_prefetch_next ((__m128i*)src);
4515 cache_prefetch_next ((__m128i*)dst);
4518 xmm_src = load_128_unaligned ((__m128i*)src);
4519 xmm_dst = load_128_aligned ((__m128i*)dst);
4521 opaque = is_opaque (xmm_src);
4522 zero = is_zero (xmm_src);
4524 unpack_565_128_4x128 (xmm_dst,
4525 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4526 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4528 /* preload next round*/
4529 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4533 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4534 &xmm_dst0, &xmm_dst1);
4538 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4539 &xmm_dst0, &xmm_dst1);
4543 opaque = is_opaque (xmm_src);
4544 zero = is_zero (xmm_src);
4546 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4550 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4551 &xmm_dst2, &xmm_dst3);
4555 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4556 &xmm_dst2, &xmm_dst3);
	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4573 ms = unpack_32_1x64 (s);
4575 *dst++ = pack_565_32_16 (
4577 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4585 /* -------------------------------------------------------------------------
 * composite_over_pixbuf_8888
 */
4590 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4592 pixman_image_t * src_image,
4593 pixman_image_t * mask_image,
4594 pixman_image_t * dst_image,
4604 uint32_t *dst_line, *dst, d;
4605 uint32_t *src_line, *src, s;
4606 int dst_stride, src_stride;
4608 uint32_t opaque, zero;
4610 __m128i xmm_src_lo, xmm_src_hi;
4611 __m128i xmm_dst_lo, xmm_dst_hi;
4613 PIXMAN_IMAGE_GET_LINE (
4614 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4615 PIXMAN_IMAGE_GET_LINE (
4616 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 * I copied the code from the MMX version and kept the FIXME.
 * If it's a problem there, it's probably a problem here.
4624 assert (src_image->drawable == mask_image->drawable);
4630 dst_line += dst_stride;
4632 src_line += src_stride;
4635 /* call prefetch hint to optimize cache load*/
4636 cache_prefetch ((__m128i*)src);
4637 cache_prefetch ((__m128i*)dst);
4639 while (w && (unsigned long)dst & 15)
4644 *dst++ = pack_1x64_32 (
4645 over_rev_non_pre_1x64 (
4646 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4651 /* call prefetch hint to optimize cache load*/
4652 cache_prefetch ((__m128i*)src);
4653 cache_prefetch ((__m128i*)dst);
4657 /* fill cache line with next memory */
4658 cache_prefetch_next ((__m128i*)src);
4659 cache_prefetch_next ((__m128i*)dst);
4661 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4663 opaque = is_opaque (xmm_src_hi);
4664 zero = is_zero (xmm_src_hi);
4666 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4670 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4671 &xmm_dst_lo, &xmm_dst_hi);
		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4678 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4680 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4682 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4683 &xmm_dst_lo, &xmm_dst_hi);
		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4699 *dst++ = pack_1x64_32 (
4700 over_rev_non_pre_1x64 (
4701 unpack_32_1x64 (s), unpack_32_1x64 (d)));
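
/* Descriptive note (an assumption based on the invert_colors and
 * is_opaque/is_zero handling above): pixbuf sources are non-premultiplied
 * and carry their color channels swapped relative to the destination, so
 * over_rev_non_pre_* reverses the channels and premultiplies before the
 * OVER, while fully opaque and fully transparent vectors take the two
 * shortcut paths.
 */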
4710 /* -------------------------------------------------------------------------------------------------
 * composite_over_n_8888_0565_ca
 */
4715 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4717 pixman_image_t * src_image,
4718 pixman_image_t * mask_image,
4719 pixman_image_t * dst_image,
4730 uint16_t *dst_line, *dst, d;
4731 uint32_t *mask_line, *mask, m;
4732 int dst_stride, mask_stride;
4736 __m128i xmm_src, xmm_alpha;
4737 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4738 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4740 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4742 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4747 PIXMAN_IMAGE_GET_LINE (
4748 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4749 PIXMAN_IMAGE_GET_LINE (
4750 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4752 xmm_src = expand_pixel_32_1x128 (src);
4753 xmm_alpha = expand_alpha_1x128 (xmm_src);
4754 mmx_src = _mm_movepi64_pi64 (xmm_src);
4755 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4762 mask_line += mask_stride;
4763 dst_line += dst_stride;
4765 /* call prefetch hint to optimize cache load*/
4766 cache_prefetch ((__m128i*)mask);
4767 cache_prefetch ((__m128i*)dst);
4769 while (w && ((unsigned long)dst & 15))
4771 m = *(uint32_t *) mask;
4776 mmx_mask = unpack_32_1x64 (m);
4777 mmx_dest = expand565_16_1x64 (d);
4779 *dst = pack_565_32_16 (
4782 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4790 /* call prefetch hint to optimize cache load*/
4791 cache_prefetch ((__m128i*)mask);
4792 cache_prefetch ((__m128i*)dst);
4796 /* fill cache line with next memory */
4797 cache_prefetch_next ((__m128i*)mask);
4798 cache_prefetch_next ((__m128i*)dst);
4801 xmm_mask = load_128_unaligned ((__m128i*)mask);
4802 xmm_dst = load_128_aligned ((__m128i*)dst);
4804 pack_cmp = _mm_movemask_epi8 (
4805 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4807 unpack_565_128_4x128 (xmm_dst,
4808 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4809 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4811 /* preload next round */
4812 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4815 if (pack_cmp != 0xffff)
4817 in_over_2x128 (&xmm_src, &xmm_src,
4818 &xmm_alpha, &xmm_alpha,
4819 &xmm_mask_lo, &xmm_mask_hi,
4820 &xmm_dst0, &xmm_dst1);
4824 pack_cmp = _mm_movemask_epi8 (
4825 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4827 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4829 if (pack_cmp != 0xffff)
4831 in_over_2x128 (&xmm_src, &xmm_src,
4832 &xmm_alpha, &xmm_alpha,
4833 &xmm_mask_lo, &xmm_mask_hi,
4834 &xmm_dst2, &xmm_dst3);
	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4848 m = *(uint32_t *) mask;
4853 mmx_mask = unpack_32_1x64 (m);
4854 mmx_dest = expand565_16_1x64 (d);
4856 *dst = pack_565_32_16 (
4859 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4871 /* -----------------------------------------------------------------------
 * composite_in_n_8_8
 */
4876 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4878 pixman_image_t * src_image,
4879 pixman_image_t * mask_image,
4880 pixman_image_t * dst_image,
4890 uint8_t *dst_line, *dst;
4891 uint8_t *mask_line, *mask;
4892 int dst_stride, mask_stride;
4899 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4900 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4902 PIXMAN_IMAGE_GET_LINE (
4903 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4904 PIXMAN_IMAGE_GET_LINE (
4905 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4907 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4911 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4916 dst_line += dst_stride;
4918 mask_line += mask_stride;
4921 /* call prefetch hint to optimize cache load*/
4922 cache_prefetch ((__m128i*)mask);
4923 cache_prefetch ((__m128i*)dst);
4925 while (w && ((unsigned long)dst & 15))
4927 m = (uint32_t) *mask++;
4928 d = (uint32_t) *dst;
4930 *dst++ = (uint8_t) pack_1x64_32 (
4932 pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4933 unpack_32_1x64 (m)),
4934 unpack_32_1x64 (d)));
4938 /* call prefetch hint to optimize cache load*/
4939 cache_prefetch ((__m128i*)mask);
4940 cache_prefetch ((__m128i*)dst);
4944 /* fill cache line with next memory */
4945 cache_prefetch_next ((__m128i*)mask);
4946 cache_prefetch_next ((__m128i*)dst);
4948 xmm_mask = load_128_unaligned ((__m128i*)mask);
4949 xmm_dst = load_128_aligned ((__m128i*)dst);
4951 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4952 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4954 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4955 &xmm_mask_lo, &xmm_mask_hi,
4956 &xmm_mask_lo, &xmm_mask_hi);
4958 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4959 &xmm_dst_lo, &xmm_dst_hi,
4960 &xmm_dst_lo, &xmm_dst_hi);
	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4972 m = (uint32_t) *mask++;
4973 d = (uint32_t) *dst;
4975 *dst++ = (uint8_t) pack_1x64_32 (
4978 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4979 unpack_32_1x64 (d)));
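
/* Scalar reference for the loops above (a sketch; mul_un8 as in the
 * earlier sketch): each destination byte becomes
 *
 *     *dst = mul_un8 (mul_un8 (srca, m), *dst);
 *
 * i.e. dest scaled by the product of the solid source alpha and the mask.
 */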
/* ---------------------------------------------------------------------------
 * composite_in_8_8
 */
4992 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4994 pixman_image_t * src_image,
4995 pixman_image_t * mask_image,
4996 pixman_image_t * dst_image,
5006 uint8_t *dst_line, *dst;
5007 uint8_t *src_line, *src;
5008 int src_stride, dst_stride;
5012 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5013 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5015 PIXMAN_IMAGE_GET_LINE (
5016 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5017 PIXMAN_IMAGE_GET_LINE (
5018 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5023 dst_line += dst_stride;
5025 src_line += src_stride;
5028 /* call prefetch hint to optimize cache load*/
5029 cache_prefetch ((__m128i*)src);
5030 cache_prefetch ((__m128i*)dst);
5032 while (w && ((unsigned long)dst & 15))
5034 s = (uint32_t) *src++;
5035 d = (uint32_t) *dst;
5037 *dst++ = (uint8_t) pack_1x64_32 (
5039 unpack_32_1x64 (s), unpack_32_1x64 (d)));
5043 /* call prefetch hint to optimize cache load*/
5044 cache_prefetch ((__m128i*)src);
5045 cache_prefetch ((__m128i*)dst);
5049 /* fill cache line with next memory */
5050 cache_prefetch_next ((__m128i*)src);
5051 cache_prefetch_next ((__m128i*)dst);
5053 xmm_src = load_128_unaligned ((__m128i*)src);
5054 xmm_dst = load_128_aligned ((__m128i*)dst);
5056 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5057 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5059 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5060 &xmm_dst_lo, &xmm_dst_hi,
5061 &xmm_dst_lo, &xmm_dst_hi);
	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5073 s = (uint32_t) *src++;
5074 d = (uint32_t) *dst;
5076 *dst++ = (uint8_t) pack_1x64_32 (
5077 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5085 /* -------------------------------------------------------------------------
 * composite_add_n_8_8
 */
5090 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5092 pixman_image_t * src_image,
5093 pixman_image_t * mask_image,
5094 pixman_image_t * dst_image,
5104 uint8_t *dst_line, *dst;
5105 uint8_t *mask_line, *mask;
5106 int dst_stride, mask_stride;
5113 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5114 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5116 PIXMAN_IMAGE_GET_LINE (
5117 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5118 PIXMAN_IMAGE_GET_LINE (
5119 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5121 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5125 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5130 dst_line += dst_stride;
5132 mask_line += mask_stride;
5135 /* call prefetch hint to optimize cache load*/
5136 cache_prefetch ((__m128i*)mask);
5137 cache_prefetch ((__m128i*)dst);
5139 while (w && ((unsigned long)dst & 15))
5141 m = (uint32_t) *mask++;
5142 d = (uint32_t) *dst;
5144 *dst++ = (uint8_t) pack_1x64_32 (
5147 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5148 unpack_32_1x64 (d)));
5152 /* call prefetch hint to optimize cache load*/
5153 cache_prefetch ((__m128i*)mask);
5154 cache_prefetch ((__m128i*)dst);
5158 /* fill cache line with next memory */
5159 cache_prefetch_next ((__m128i*)mask);
5160 cache_prefetch_next ((__m128i*)dst);
5162 xmm_mask = load_128_unaligned ((__m128i*)mask);
5163 xmm_dst = load_128_aligned ((__m128i*)dst);
5165 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5166 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5168 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5169 &xmm_mask_lo, &xmm_mask_hi,
5170 &xmm_mask_lo, &xmm_mask_hi);
5172 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5173 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5185 m = (uint32_t) *mask++;
5186 d = (uint32_t) *dst;
5188 *dst++ = (uint8_t) pack_1x64_32 (
5191 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5192 unpack_32_1x64 (d)));
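
/* Per byte, the loops above compute a saturating add (sketch notation;
 * mul_un8 as in the earlier sketch):
 *
 *     *dst = MIN (255, *dst + mul_un8 (srca, m));
 */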
5201 /* ----------------------------------------------------------------------
 * composite_add_8000_8000
 */
5206 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5208 pixman_image_t * src_image,
5209 pixman_image_t * mask_image,
5210 pixman_image_t * dst_image,
5220 uint8_t *dst_line, *dst;
5221 uint8_t *src_line, *src;
5222 int dst_stride, src_stride;
5226 PIXMAN_IMAGE_GET_LINE (
5227 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5228 PIXMAN_IMAGE_GET_LINE (
5229 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5236 /* call prefetch hint to optimize cache load*/
5237 cache_prefetch ((__m128i*)src);
5238 cache_prefetch ((__m128i*)dst);
5240 dst_line += dst_stride;
5241 src_line += src_stride;
5245 while (w && (unsigned long)dst & 3)
	t = (*dst) + (*src++);
	*dst++ = t | (0 - (t >> 8));	/* saturate: if t > 0xff, (t >> 8) is 1 and 0 - 1 is 0xff */
5252 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
	t = (*dst) + (*src++);
	*dst++ = t | (0 - (t >> 8));	/* saturate: if t > 0xff, (t >> 8) is 1 and 0 - 1 is 0xff */
5271 /* ---------------------------------------------------------------------
 * composite_add_8888_8888
 */
5275 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5277 pixman_image_t * src_image,
5278 pixman_image_t * mask_image,
5279 pixman_image_t * dst_image,
5289 uint32_t *dst_line, *dst;
5290 uint32_t *src_line, *src;
5291 int dst_stride, src_stride;
5293 PIXMAN_IMAGE_GET_LINE (
5294 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5295 PIXMAN_IMAGE_GET_LINE (
5296 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5301 dst_line += dst_stride;
5303 src_line += src_stride;
5305 core_combine_add_u_sse2 (dst, src, NULL, width);
5311 /* -------------------------------------------------------------------------------------------------
 * sse2_composite_copy_area
 */
5315 static pixman_bool_t
5316 pixman_blt_sse2 (uint32_t *src_bits,
5329 uint8_t * src_bytes;
5330 uint8_t * dst_bytes;
5333 if (src_bpp != dst_bpp)
5338 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5339 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5341 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5342 byte_width = 2 * width;
5346 else if (src_bpp == 32)
5348 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5349 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5350 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5351 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5352 byte_width = 4 * width;
5361 cache_prefetch ((__m128i*)src_bytes);
5362 cache_prefetch ((__m128i*)dst_bytes);
5367 uint8_t *s = src_bytes;
5368 uint8_t *d = dst_bytes;
5369 src_bytes += src_stride;
5370 dst_bytes += dst_stride;
5373 cache_prefetch_next ((__m128i*)s);
5374 cache_prefetch_next ((__m128i*)d);
5376 while (w >= 2 && ((unsigned long)d & 3))
5378 *(uint16_t *)d = *(uint16_t *)s;
5384 while (w >= 4 && ((unsigned long)d & 15))
5386 *(uint32_t *)d = *(uint32_t *)s;
5393 cache_prefetch_next ((__m128i*)s);
5394 cache_prefetch_next ((__m128i*)d);
5398 __m128i xmm0, xmm1, xmm2, xmm3;
5400 /* 128 bytes ahead */
5401 cache_prefetch (((__m128i*)s) + 8);
5402 cache_prefetch (((__m128i*)d) + 8);
5404 xmm0 = load_128_unaligned ((__m128i*)(s));
5405 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5406 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5407 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5409 save_128_aligned ((__m128i*)(d), xmm0);
5410 save_128_aligned ((__m128i*)(d + 16), xmm1);
5411 save_128_aligned ((__m128i*)(d + 32), xmm2);
5412 save_128_aligned ((__m128i*)(d + 48), xmm3);
5419 cache_prefetch_next ((__m128i*)s);
5420 cache_prefetch_next ((__m128i*)d);
	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
5431 cache_prefetch_next ((__m128i*)s);
5432 cache_prefetch_next ((__m128i*)d);
5436 *(uint32_t *)d = *(uint32_t *)s;
5445 *(uint16_t *)d = *(uint16_t *)s;
5458 sse2_composite_copy_area (pixman_implementation_t *imp,
5460 pixman_image_t * src_image,
5461 pixman_image_t * mask_image,
5462 pixman_image_t * dst_image,
5472 pixman_blt_sse2 (src_image->bits.bits,
5473 dst_image->bits.bits,
5474 src_image->bits.rowstride,
5475 dst_image->bits.rowstride,
5476 PIXMAN_FORMAT_BPP (src_image->bits.format),
5477 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5478 src_x, src_y, dest_x, dest_y, width, height);
5482 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5484 pixman_image_t * src_image,
5485 pixman_image_t * mask_image,
5486 pixman_image_t * dst_image,
5496 uint32_t *src, *src_line, s;
5497 uint32_t *dst, *dst_line, d;
5498 uint8_t *mask, *mask_line;
5500 int src_stride, mask_stride, dst_stride;
5504 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5505 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5506 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5508 PIXMAN_IMAGE_GET_LINE (
5509 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5510 PIXMAN_IMAGE_GET_LINE (
5511 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5512 PIXMAN_IMAGE_GET_LINE (
5513 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5518 src_line += src_stride;
5520 dst_line += dst_stride;
5522 mask_line += mask_stride;
5526 /* call prefetch hint to optimize cache load*/
5527 cache_prefetch ((__m128i*)src);
5528 cache_prefetch ((__m128i*)dst);
5529 cache_prefetch ((__m128i*)mask);
5531 while (w && (unsigned long)dst & 15)
5533 s = 0xff000000 | *src++;
5534 m = (uint32_t) *mask++;
5536 ms = unpack_32_1x64 (s);
5540 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5541 __m64 md = unpack_32_1x64 (d);
5543 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5546 *dst++ = pack_1x64_32 (ms);
5550 /* call prefetch hint to optimize cache load*/
5551 cache_prefetch ((__m128i*)src);
5552 cache_prefetch ((__m128i*)dst);
5553 cache_prefetch ((__m128i*)mask);
5557 /* fill cache line with next memory */
5558 cache_prefetch_next ((__m128i*)src);
5559 cache_prefetch_next ((__m128i*)dst);
5560 cache_prefetch_next ((__m128i*)mask);
5562 m = *(uint32_t*) mask;
5563 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5565 if (m == 0xffffffff)
5567 save_128_aligned ((__m128i*)dst, xmm_src);
5571 xmm_dst = load_128_aligned ((__m128i*)dst);
5573 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5575 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5576 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5577 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5579 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &mask_00ff, &mask_00ff,
			   &xmm_mask_lo, &xmm_mask_hi,
			   &xmm_dst_lo, &xmm_dst_hi);
5583 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5594 m = (uint32_t) *mask++;
5598 s = 0xff000000 | *src;
5610 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5611 md = unpack_32_1x64 (d);
5612 ms = unpack_32_1x64 (s);
5614 *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
5629 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5631 pixman_image_t * src_image,
5632 pixman_image_t * mask_image,
5633 pixman_image_t * dst_image,
5643 uint32_t *src, *src_line, s;
5644 uint32_t *dst, *dst_line, d;
5645 uint8_t *mask, *mask_line;
5647 int src_stride, mask_stride, dst_stride;
5650 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5651 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5652 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5654 PIXMAN_IMAGE_GET_LINE (
5655 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5656 PIXMAN_IMAGE_GET_LINE (
5657 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5658 PIXMAN_IMAGE_GET_LINE (
5659 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5664 src_line += src_stride;
5666 dst_line += dst_stride;
5668 mask_line += mask_stride;
5672 /* call prefetch hint to optimize cache load*/
5673 cache_prefetch ((__m128i *)src);
5674 cache_prefetch ((__m128i *)dst);
5675 cache_prefetch ((__m128i *)mask);
5677 while (w && (unsigned long)dst & 15)
5682 m = (uint32_t) *mask++;
5689 if (sa == 0xff && m == 0xff)
5695 __m64 ms, md, ma, msa;
5697 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5698 ms = unpack_32_1x64 (s);
5699 md = unpack_32_1x64 (d);
5701 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5703 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5711 /* call prefetch hint to optimize cache load*/
5712 cache_prefetch ((__m128i *)src);
5713 cache_prefetch ((__m128i *)dst);
5714 cache_prefetch ((__m128i *)mask);
5718 /* fill cache line with next memory */
5719 cache_prefetch_next ((__m128i *)src);
5720 cache_prefetch_next ((__m128i *)dst);
5721 cache_prefetch_next ((__m128i *)mask);
5723 m = *(uint32_t *) mask;
5727 xmm_src = load_128_unaligned ((__m128i*)src);
5729 if (m == 0xffffffff && is_opaque (xmm_src))
5731 save_128_aligned ((__m128i *)dst, xmm_src);
5735 xmm_dst = load_128_aligned ((__m128i *)dst);
5737 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5739 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5740 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5741 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5743 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5744 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5746 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5747 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5749 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5764 m = (uint32_t) *mask++;
5771 if (sa == 0xff && m == 0xff)
5777 __m64 ms, md, ma, msa;
5779 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5780 ms = unpack_32_1x64 (s);
5781 md = unpack_32_1x64 (d);
5783 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5785 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5798 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5800 pixman_image_t * src_image,
5801 pixman_image_t * mask_image,
5802 pixman_image_t * dst_image,
5813 uint32_t *dst_line, *dst;
5815 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5816 __m128i xmm_dsta_hi, xmm_dsta_lo;
5820 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5825 PIXMAN_IMAGE_GET_LINE (
5826 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5828 xmm_src = expand_pixel_32_1x128 (src);
5834 /* call prefetch hint to optimize cache load*/
5835 cache_prefetch ((__m128i*)dst);
5837 dst_line += dst_stride;
5840 while (w && (unsigned long)dst & 15)
5844 vd = unpack_32_1x64 (*dst);
5846 *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5847 _mm_movepi64_pi64 (xmm_src)));
5852 cache_prefetch ((__m128i*)dst);
5856 __m128i tmp_lo, tmp_hi;
5858 /* fill cache line with next memory */
5859 cache_prefetch_next ((__m128i*)(dst + 4));
5861 xmm_dst = load_128_aligned ((__m128i*)dst);
5863 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5864 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5869 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5870 &xmm_dsta_lo, &xmm_dsta_hi,
			&tmp_lo, &tmp_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5884 vd = unpack_32_1x64 (*dst);
5886 *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
5887 _mm_movepi64_pi64 (xmm_src)));
5898 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5900 pixman_image_t * src_image,
5901 pixman_image_t * mask_image,
5902 pixman_image_t * dst_image,
5912 uint32_t *src, *src_line, s;
5913 uint32_t *dst, *dst_line, d;
5914 uint32_t *mask, *mask_line;
5916 int src_stride, mask_stride, dst_stride;
5919 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5920 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5921 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5923 PIXMAN_IMAGE_GET_LINE (
5924 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5925 PIXMAN_IMAGE_GET_LINE (
5926 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5927 PIXMAN_IMAGE_GET_LINE (
5928 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5933 src_line += src_stride;
5935 dst_line += dst_stride;
5937 mask_line += mask_stride;
5941 /* call prefetch hint to optimize cache load*/
5942 cache_prefetch ((__m128i *)src);
5943 cache_prefetch ((__m128i *)dst);
5944 cache_prefetch ((__m128i *)mask);
5946 while (w && (unsigned long)dst & 15)
5951 m = (*mask++) >> 24;
5958 if (sa == 0xff && m == 0xff)
5964 __m64 ms, md, ma, msa;
5966 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5967 ms = unpack_32_1x64 (s);
5968 md = unpack_32_1x64 (d);
5970 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5972 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5980 /* call prefetch hint to optimize cache load*/
5981 cache_prefetch ((__m128i *)src);
5982 cache_prefetch ((__m128i *)dst);
5983 cache_prefetch ((__m128i *)mask);
5987 /* fill cache line with next memory */
5988 cache_prefetch_next ((__m128i *)src);
5989 cache_prefetch_next ((__m128i *)dst);
5990 cache_prefetch_next ((__m128i *)mask);
5992 xmm_mask = load_128_unaligned ((__m128i*)mask);
5994 if (!is_transparent (xmm_mask))
5996 xmm_src = load_128_unaligned ((__m128i*)src);
5998 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
6000 save_128_aligned ((__m128i *)dst, xmm_src);
6004 xmm_dst = load_128_aligned ((__m128i *)dst);
6006 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6007 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
6008 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6010 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
6011 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
6013 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
6014 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
6016 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6031 m = (*mask++) >> 24;
6038 if (sa == 0xff && m == 0xff)
6044 __m64 ms, md, ma, msa;
6046 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
6047 ms = unpack_32_1x64 (s);
6048 md = unpack_32_1x64 (d);
6050 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
6052 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
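
/* The table below maps (operator, src format, mask format, dest format)
 * tuples to the fast paths above; 'solid' matches any solid image and
 * 'null' means that image is absent.
 */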
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),

    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),

    { PIXMAN_OP_NONE },
};
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,
          int                      src_stride,
          int                      dst_stride,
          int                      src_bpp,
          int                      dst_bpp,
          int                      src_x,
          int                      src_y,
          int                      dst_x,
          int                      dst_y,
          int                      width,
          int                      height)
{
    if (!pixman_blt_sse2 (
	    src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
	    src_x, src_y, dst_x, dst_y, width, height))
    {
	/* The SSE2 blitter only handles 16 and 32 bpp; everything else is
	 * passed down the delegate chain. */
	return _pixman_implementation_blt (
	    imp->delegate,
	    src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
	    src_x, src_y, dst_x, dst_y, width, height);
    }

    return TRUE;
}
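/* The 32-bit x86 ABI only guarantees 4-byte stack alignment, while GCC
 * assumes 16 bytes when it spills SSE registers; force_align_arg_pointer
 * makes these externally callable entry points realign the stack
 * themselves.
 */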
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
           int                      stride,
           int                      bpp,
           int                      x,
           int                      y,
           int                      width,
           int                      height,
           uint32_t                 xor)
{
    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
    {
	return _pixman_implementation_fill (
	    imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (void)
{
#ifdef USE_MMX
    pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
#else
    pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
#endif
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
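    /* Anything this implementation cannot handle is passed down the
     * delegate chain created above (MMX or the generic fast paths), so
     * every operation always finds some implementation. */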
    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
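    /* create_mask_16_128 () and create_mask_2x32_128 () are small helpers
     * defined earlier in this file; conceptually (sketch):
     *
     *     create_mask_16_128 (m)       ~  _mm_set1_epi16 (m)
     *     create_mask_2x32_128 (a, b)  ~  _mm_set_epi32 (a, b, a, b)
     *
     * i.e. each mask repeats a 16-bit or 64-bit pattern across the whole
     * register.
     */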
    /* MMX constants */
    mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
    mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);

    mask_x0080 = create_mask_16_64 (0x0080);
    mask_x00ff = create_mask_16_64 (0x00ff);
    mask_x0101 = create_mask_16_64 (0x0101);
    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);

    /* Leave the MMX state clean so that later x87 FP code works. */
    _mm_empty ();
    /* Set up function pointers */

    /* SSE2 versions of the combiners declared in pixman-combine32.h */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
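    /* Both tables hold functions with the common combiner signature
     * (see pixman-private.h), roughly:
     *
     *     void combine (pixman_implementation_t *imp, pixman_op_t op,
     *                   uint32_t *dest, const uint32_t *src,
     *                   const uint32_t *mask, int width);
     *
     * combine_32 treats the mask as one alpha value per pixel; the _ca
     * variants apply it per channel (component alpha).
     */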
    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    return imp;
}

#endif /* USE_SSE2 */