2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41 * the pixman-x64-mmx-emulation.h file contains
42 * implementations of those MMX intrinsics that
43 * are used in the SSE2 implementation.
45 # include "pixman-x64-mmx-emulation.h"
50 /* --------------------------------------------------------------------
/* Constant masks used throughout this file.
 * NOTE(review): they are plain statics, so they must be initialized at
 * runtime before use — presumably by an init function outside this
 * extract; confirm against the full source.
 */
/* MMX (__m64) masks, used by the 1x64 single-pixel helpers. */
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
/* SSE2 (__m128i) masks, used by the 2x128 four-pixel helpers. */
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
/* Masks for packing/unpacking r5g6b5 pixels. */
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
79 /* ----------------------------------------------------------------------
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
85 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
91 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
98 __m128i r, g, b, rb, t;
100 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
104 rb = _mm_or_si128 (r, b);
105 t = _mm_and_si128 (rb, mask_565_fix_rb);
106 t = _mm_srli_epi32 (t, 5);
107 rb = _mm_or_si128 (rb, t);
109 t = _mm_and_si128 (g, mask_565_fix_g);
110 t = _mm_srli_epi32 (t, 6);
111 g = _mm_or_si128 (g, t);
113 return _mm_or_si128 (rb, g);
116 static force_inline void
117 unpack_565_128_4x128 (__m128i data,
125 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
128 lo = unpack_565_to_8888 (lo);
129 hi = unpack_565_to_8888 (hi);
131 unpack_128_2x128 (lo, data0, data1);
132 unpack_128_2x128 (hi, data2, data3);
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
138 return (uint16_t) (((pixel >> 8) & 0xf800) |
139 ((pixel >> 5) & 0x07e0) |
140 ((pixel >> 3) & 0x001f));
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
146 return _mm_packus_epi16 (lo, hi);
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
153 __m128i r, g1, g2, b;
155 data = pack_2x128_128 (lo, hi);
157 r = _mm_and_si128 (data, mask_565_r);
158 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
162 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
168 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169 pack_565_2x128_128 (*xmm2, *xmm3));
172 static force_inline int
173 is_opaque (__m128i x)
175 __m128i ffs = _mm_cmpeq_epi8 (x, x);
177 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
180 static force_inline int
183 return _mm_movemask_epi8 (
184 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
187 static force_inline int
188 is_transparent (__m128i x)
190 return (_mm_movemask_epi8 (
191 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
197 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
203 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204 _MM_SHUFFLE (3, 3, 3, 3)),
205 _MM_SHUFFLE (3, 3, 3, 3));
208 static force_inline void
209 expand_alpha_2x128 (__m128i data_lo,
216 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
219 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i data_lo,
231 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
247 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249 lo = _mm_adds_epu16 (lo, mask_0080);
250 hi = _mm_adds_epu16 (hi, mask_0080);
251 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
258 __m128i* alpha_dst_lo,
259 __m128i* alpha_dst_hi,
262 __m128i* alpha_src_lo,
263 __m128i* alpha_src_hi,
267 __m128i t1_lo, t1_hi;
268 __m128i t2_lo, t2_hi;
270 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
273 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
277 static force_inline void
278 negate_2x128 (__m128i data_lo,
283 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
287 static force_inline void
288 invert_colors_2x128 (__m128i data_lo,
295 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
311 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
313 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
315 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i src_lo,
326 __m128i alpha_lo, alpha_hi;
328 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
330 lo = _mm_or_si128 (alpha_lo, mask_alpha);
331 hi = _mm_or_si128 (alpha_hi, mask_alpha);
333 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
335 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
337 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
353 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
356 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
359 static force_inline void
360 cache_prefetch (__m128i* addr)
362 _mm_prefetch ((void const*)addr, _MM_HINT_T0);
365 static force_inline void
366 cache_prefetch_next (__m128i* addr)
368 _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
371 /* prefetching NULL is very slow on some systems. don't do that. */
373 static force_inline void
374 maybe_prefetch (__m128i* addr)
377 cache_prefetch (addr);
380 static force_inline void
381 maybe_prefetch_next (__m128i* addr)
384 cache_prefetch_next (addr);
387 /* load 4 pixels from a 16-byte boundary aligned address */
388 static force_inline __m128i
389 load_128_aligned (__m128i* src)
391 return _mm_load_si128 (src);
394 /* load 4 pixels from a unaligned address */
395 static force_inline __m128i
396 load_128_unaligned (const __m128i* src)
398 return _mm_loadu_si128 (src);
401 /* save 4 pixels using Write Combining memory on a 16-byte
402 * boundary aligned address
404 static force_inline void
405 save_128_write_combining (__m128i* dst,
408 _mm_stream_si128 (dst, data);
411 /* save 4 pixels on a 16-byte boundary aligned address */
412 static force_inline void
413 save_128_aligned (__m128i* dst,
416 _mm_store_si128 (dst, data);
419 /* save 4 pixels on a unaligned address */
420 static force_inline void
421 save_128_unaligned (__m128i* dst,
424 _mm_storeu_si128 (dst, data);
427 /* ------------------------------------------------------------------
431 static force_inline __m64
432 load_32_1x64 (uint32_t data)
434 return _mm_cvtsi32_si64 (data);
437 static force_inline __m64
438 unpack_32_1x64 (uint32_t data)
440 return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
443 static force_inline __m64
444 expand_alpha_1x64 (__m64 data)
446 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
449 static force_inline __m64
450 expand_alpha_rev_1x64 (__m64 data)
452 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
455 static force_inline __m64
456 expand_pixel_8_1x64 (uint8_t data)
458 return _mm_shuffle_pi16 (
459 unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
462 static force_inline __m64
463 pix_multiply_1x64 (__m64 data,
466 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
471 static force_inline __m64
472 pix_add_multiply_1x64 (__m64* src,
477 __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
478 __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
480 return _mm_adds_pu8 (t1, t2);
483 static force_inline __m64
484 negate_1x64 (__m64 data)
486 return _mm_xor_si64 (data, mask_x00ff);
489 static force_inline __m64
490 invert_colors_1x64 (__m64 data)
492 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
495 static force_inline __m64
496 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
498 return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
501 static force_inline __m64
502 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
504 return over_1x64 (pix_multiply_1x64 (*src, *mask),
505 pix_multiply_1x64 (*alpha, *mask),
509 static force_inline __m64
510 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
512 __m64 alpha = expand_alpha_1x64 (src);
514 return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
515 _mm_or_si64 (alpha, mask_x_alpha)),
520 static force_inline uint32_t
521 pack_1x64_32 (__m64 data)
523 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
526 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
530 * --- Expanding 565 in the low word ---
532 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
533 * m = m & (01f0003f001f);
534 * m = m * (008404100840);
537 * Note the trick here - the top word is shifted by another nibble to
538 * avoid it bumping into the middle word
540 static force_inline __m64
541 expand565_16_1x64 (uint16_t pixel)
546 p = _mm_cvtsi32_si64 ((uint32_t) pixel);
548 t1 = _mm_slli_si64 (p, 36 - 11);
549 t2 = _mm_slli_si64 (p, 16 - 5);
551 p = _mm_or_si64 (t1, p);
552 p = _mm_or_si64 (t2, p);
553 p = _mm_and_si64 (p, mask_x565_rgb);
554 p = _mm_mullo_pi16 (p, mask_x565_unpack);
556 return _mm_srli_pi16 (p, 8);
559 /* ----------------------------------------------------------------------------
560 * Compose Core transformations
/* OVER for one src/dst pixel pair: src + dst * (255 - src.alpha).
 * NOTE(review): upstream lines 564-575 (apparently local declarations
 * and possibly fast paths) are missing from this extract — restore
 * from the full source before building.
 */
562 static force_inline uint32_t
563 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
576 ms = unpack_32_1x64 (src);
577 return pack_1x64_32 (
578 over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
/* Fetch one source pixel; when a per-pixel mask is supplied, the
 * source is multiplied by the mask's alpha first.
 * NOTE(review): upstream lines 586-592 (apparently declarations and
 * the test for a NULL/valid mask pointer) are missing from this
 * extract — restore from the full source.
 */
584 static force_inline uint32_t
585 combine1 (const uint32_t *ps, const uint32_t *pm)
593 mm = unpack_32_1x64 (*pm);
594 mm = expand_alpha_1x64 (mm);
596 ms = unpack_32_1x64 (s);
597 ms = pix_multiply_1x64 (ms, mm);
599 s = pack_1x64_32 (ms);
/* Fetch four source pixels, applying the four mask pixels' alpha when
 * a mask pointer is present.  A fully transparent mask short-circuits
 * to zero.
 * NOTE(review): several upstream lines (610-613, 618-623, etc. —
 * apparently declarations and the NULL-mask branch) are missing from
 * this extract — restore from the full source.
 */
605 static force_inline __m128i
606 combine4 (const __m128i *ps, const __m128i *pm)
608 __m128i xmm_src_lo, xmm_src_hi;
609 __m128i xmm_msk_lo, xmm_msk_hi;
614 xmm_msk_lo = load_128_unaligned (pm);
/* all four mask alphas zero -> result is transparent black */
616 if (is_transparent (xmm_msk_lo))
617 return _mm_setzero_si128 ();
620 s = load_128_unaligned (ps);
624 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
625 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
627 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
629 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
630 &xmm_msk_lo, &xmm_msk_hi,
631 &xmm_src_lo, &xmm_src_hi);
633 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
/* OVER combiner: head loop to 16-byte-align pd, 4-pixel SSE2 main
 * loop with fully-opaque / fully-transparent fast paths, scalar tail.
 * NOTE(review): loop headers, braces and some declarations are
 * missing from this extract (gaps in the upstream numbering).
 * NOTE(review): `(unsigned long)pd & 15` truncates pointers on LLP64
 * (Win64) — should be uintptr_t.
 */
639 static force_inline void
640 core_combine_over_u_sse2 (uint32_t* pd,
647 __m128i xmm_dst_lo, xmm_dst_hi;
648 __m128i xmm_src_lo, xmm_src_hi;
649 __m128i xmm_alpha_lo, xmm_alpha_hi;
651 /* call prefetch hint to optimize cache load*/
652 cache_prefetch ((__m128i*)ps);
653 cache_prefetch ((__m128i*)pd);
654 maybe_prefetch ((__m128i*)pm);
656 /* Align dst on a 16-byte boundary */
657 while (w && ((unsigned long)pd & 15))
660 s = combine1 (ps, pm);
662 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
669 /* call prefetch hint to optimize cache load*/
670 cache_prefetch ((__m128i*)ps);
671 cache_prefetch ((__m128i*)pd);
672 maybe_prefetch ((__m128i*)pm);
676 /* fill cache line with next memory */
677 cache_prefetch_next ((__m128i*)ps);
678 cache_prefetch_next ((__m128i*)pd);
679 maybe_prefetch_next ((__m128i*)pm);
681 /* I'm loading unaligned because I'm not sure about
682 * the address alignment.
684 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
/* opaque: plain copy; transparent: no-op; otherwise full blend */
686 if (is_opaque (xmm_src_hi))
688 save_128_aligned ((__m128i*)pd, xmm_src_hi);
690 else if (!is_zero (xmm_src_hi))
692 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
694 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
695 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
698 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
700 over_2x128 (&xmm_src_lo, &xmm_src_hi,
701 &xmm_alpha_lo, &xmm_alpha_hi,
702 &xmm_dst_lo, &xmm_dst_hi);
704 /* rebuid the 4 pixel data and save*/
705 save_128_aligned ((__m128i*)pd,
706 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
719 s = combine1 (ps, pm);
721 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
/* OVER_REVERSE combiner: identical structure to core_combine_over_u_sse2
 * but composites dst over src (note the swapped (d, s) arguments and
 * the dst-alpha expansion).
 * NOTE(review): loop headers/braces are missing from this extract;
 * `(unsigned long)pd` should be uintptr_t for LLP64 portability.
 */
730 static force_inline void
731 core_combine_over_reverse_u_sse2 (uint32_t* pd,
738 __m128i xmm_dst_lo, xmm_dst_hi;
739 __m128i xmm_src_lo, xmm_src_hi;
740 __m128i xmm_alpha_lo, xmm_alpha_hi;
742 /* call prefetch hint to optimize cache load*/
743 cache_prefetch ((__m128i*)ps);
744 cache_prefetch ((__m128i*)pd);
745 maybe_prefetch ((__m128i*)pm);
747 /* Align dst on a 16-byte boundary */
749 ((unsigned long)pd & 15))
752 s = combine1 (ps, pm);
754 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
761 /* call prefetch hint to optimize cache load*/
762 cache_prefetch ((__m128i*)ps);
763 cache_prefetch ((__m128i*)pd);
764 maybe_prefetch ((__m128i*)pm);
768 /* fill cache line with next memory */
769 cache_prefetch_next ((__m128i*)ps);
770 cache_prefetch_next ((__m128i*)pd);
771 maybe_prefetch_next ((__m128i*)pm);
773 /* I'm loading unaligned because I'm not sure
774 * about the address alignment.
776 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
777 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
779 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
780 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
782 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
783 &xmm_alpha_lo, &xmm_alpha_hi);
785 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
786 &xmm_alpha_lo, &xmm_alpha_hi,
787 &xmm_src_lo, &xmm_src_hi);
789 /* rebuid the 4 pixel data and save*/
790 save_128_aligned ((__m128i*)pd,
791 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
804 s = combine1 (ps, pm);
806 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
/* IN for one pixel pair: dst * src.alpha / 255, with fast paths keyed
 * on src's alpha.
 * NOTE(review): upstream lines 818-822 and 828-832 (the maska == 0 /
 * maska == 0xff branches and final return) are missing from this
 * extract — restore from the full source.
 */
814 static force_inline uint32_t
815 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
817 uint32_t maska = src >> 24;
823 else if (maska != 0xff)
825 return pack_1x64_32 (
826 pix_multiply_1x64 (unpack_32_1x64 (dst),
827 expand_alpha_1x64 (unpack_32_1x64 (src))));
/* IN combiner: dst = src * dst.alpha.  Same align / 4-pixel / tail
 * structure as the other combiners.  Note the argument order in the
 * scalar calls: core_combine_in_u_pixelsse2 (d, s).
 * NOTE(review): loop headers/braces missing from this extract;
 * `(unsigned long) pd` should be uintptr_t for LLP64 portability.
 */
833 static force_inline void
834 core_combine_in_u_sse2 (uint32_t* pd,
841 __m128i xmm_src_lo, xmm_src_hi;
842 __m128i xmm_dst_lo, xmm_dst_hi;
844 /* call prefetch hint to optimize cache load*/
845 cache_prefetch ((__m128i*)ps);
846 cache_prefetch ((__m128i*)pd);
847 maybe_prefetch ((__m128i*)pm);
849 while (w && ((unsigned long) pd & 15))
851 s = combine1 (ps, pm);
854 *pd++ = core_combine_in_u_pixelsse2 (d, s);
861 /* call prefetch hint to optimize cache load*/
862 cache_prefetch ((__m128i*)ps);
863 cache_prefetch ((__m128i*)pd);
864 maybe_prefetch ((__m128i*)pm);
868 /* fill cache line with next memory */
869 cache_prefetch_next ((__m128i*)ps);
870 cache_prefetch_next ((__m128i*)pd);
871 maybe_prefetch_next ((__m128i*)pm);
873 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
874 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
876 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
877 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
879 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
880 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
881 &xmm_dst_lo, &xmm_dst_hi,
882 &xmm_dst_lo, &xmm_dst_hi);
884 save_128_aligned ((__m128i*)pd,
885 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
896 s = combine1 (ps, pm);
899 *pd++ = core_combine_in_u_pixelsse2 (d, s);
/* IN_REVERSE combiner: dst = dst * src.alpha (note the swapped (s, d)
 * scalar calls and the src-alpha expansion in the vector path).
 * NOTE(review): loop headers/braces missing from this extract;
 * `(unsigned long) pd` should be uintptr_t for LLP64 portability.
 */
907 static force_inline void
908 core_combine_reverse_in_u_sse2 (uint32_t* pd,
915 __m128i xmm_src_lo, xmm_src_hi;
916 __m128i xmm_dst_lo, xmm_dst_hi;
918 /* call prefetch hint to optimize cache load*/
919 cache_prefetch ((__m128i*)ps);
920 cache_prefetch ((__m128i*)pd);
921 maybe_prefetch ((__m128i*)pm);
923 while (w && ((unsigned long) pd & 15))
925 s = combine1 (ps, pm);
928 *pd++ = core_combine_in_u_pixelsse2 (s, d);
935 /* call prefetch hint to optimize cache load*/
936 cache_prefetch ((__m128i*)ps);
937 cache_prefetch ((__m128i*)pd);
938 maybe_prefetch ((__m128i*)pm);
942 /* fill cache line with next memory */
943 cache_prefetch_next ((__m128i*)ps);
944 cache_prefetch_next ((__m128i*)pd);
945 maybe_prefetch_next ((__m128i*)pm);
947 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
948 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
950 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
951 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
953 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
954 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
955 &xmm_src_lo, &xmm_src_hi,
956 &xmm_dst_lo, &xmm_dst_hi);
959 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
970 s = combine1 (ps, pm);
973 *pd++ = core_combine_in_u_pixelsse2 (s, d);
/* OUT_REVERSE combiner: dst = dst * (255 - src.alpha).
 * NOTE(review): loop headers/braces missing from this extract;
 * `(unsigned long) pd` should be uintptr_t for LLP64 portability.
 */
981 static force_inline void
982 core_combine_reverse_out_u_sse2 (uint32_t* pd,
987 /* call prefetch hint to optimize cache load*/
988 cache_prefetch ((__m128i*)ps);
989 cache_prefetch ((__m128i*)pd);
990 maybe_prefetch ((__m128i*)pm);
992 while (w && ((unsigned long) pd & 15))
994 uint32_t s = combine1 (ps, pm);
997 *pd++ = pack_1x64_32 (
999 unpack_32_1x64 (d), negate_1x64 (
1000 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1008 /* call prefetch hint to optimize cache load*/
1009 cache_prefetch ((__m128i*)ps);
1010 cache_prefetch ((__m128i*)pd);
1011 maybe_prefetch ((__m128i*)pm);
1015 __m128i xmm_src_lo, xmm_src_hi;
1016 __m128i xmm_dst_lo, xmm_dst_hi;
1018 /* fill cache line with next memory */
1019 cache_prefetch_next ((__m128i*)ps);
1020 cache_prefetch_next ((__m128i*)pd);
1021 maybe_prefetch_next ((__m128i*)pm);
1023 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1024 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1026 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1027 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1029 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1030 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1032 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1033 &xmm_src_lo, &xmm_src_hi,
1034 &xmm_dst_lo, &xmm_dst_hi);
1037 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1049 uint32_t s = combine1 (ps, pm);
1052 *pd++ = pack_1x64_32 (
1054 unpack_32_1x64 (d), negate_1x64 (
1055 expand_alpha_1x64 (unpack_32_1x64 (s)))));
/* OUT combiner: dst = src * (255 - dst.alpha) (mirror of
 * core_combine_reverse_out_u_sse2 with src/dst roles swapped).
 * NOTE(review): loop headers/braces missing from this extract;
 * `(unsigned long) pd` should be uintptr_t for LLP64 portability.
 */
1063 static force_inline void
1064 core_combine_out_u_sse2 (uint32_t* pd,
1069 /* call prefetch hint to optimize cache load*/
1070 cache_prefetch ((__m128i*)ps);
1071 cache_prefetch ((__m128i*)pd);
1072 maybe_prefetch ((__m128i*)pm);
1074 while (w && ((unsigned long) pd & 15))
1076 uint32_t s = combine1 (ps, pm);
1079 *pd++ = pack_1x64_32 (
1081 unpack_32_1x64 (s), negate_1x64 (
1082 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1089 /* call prefetch hint to optimize cache load*/
1090 cache_prefetch ((__m128i*)ps);
1091 cache_prefetch ((__m128i*)pd);
1092 maybe_prefetch ((__m128i*)pm);
1096 __m128i xmm_src_lo, xmm_src_hi;
1097 __m128i xmm_dst_lo, xmm_dst_hi;
1099 /* fill cache line with next memory */
1100 cache_prefetch_next ((__m128i*)ps);
1101 cache_prefetch_next ((__m128i*)pd);
1102 maybe_prefetch_next ((__m128i*)pm);
1104 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1105 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1107 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1108 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1110 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1111 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1113 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1114 &xmm_dst_lo, &xmm_dst_hi,
1115 &xmm_dst_lo, &xmm_dst_hi);
1118 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1129 uint32_t s = combine1 (ps, pm);
1132 *pd++ = pack_1x64_32 (
1134 unpack_32_1x64 (s), negate_1x64 (
1135 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1143 static force_inline uint32_t
1144 core_combine_atop_u_pixel_sse2 (uint32_t src,
1147 __m64 s = unpack_32_1x64 (src);
1148 __m64 d = unpack_32_1x64 (dst);
1150 __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1151 __m64 da = expand_alpha_1x64 (d);
1153 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
/* ATOP combiner: dst = src * dst.alpha + dst * (255 - src.alpha).
 * NOTE(review): loop headers/braces missing from this extract;
 * `(unsigned long) pd` should be uintptr_t for LLP64 portability.
 */
1156 static force_inline void
1157 core_combine_atop_u_sse2 (uint32_t* pd,
1164 __m128i xmm_src_lo, xmm_src_hi;
1165 __m128i xmm_dst_lo, xmm_dst_hi;
1166 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1167 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1169 /* call prefetch hint to optimize cache load*/
1170 cache_prefetch ((__m128i*)ps);
1171 cache_prefetch ((__m128i*)pd);
1172 maybe_prefetch ((__m128i*)pm);
1174 while (w && ((unsigned long) pd & 15))
1176 s = combine1 (ps, pm);
1179 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1186 /* call prefetch hint to optimize cache load*/
1187 cache_prefetch ((__m128i*)ps);
1188 cache_prefetch ((__m128i*)pd);
1189 maybe_prefetch ((__m128i*)pm);
1193 /* fill cache line with next memory */
1194 cache_prefetch_next ((__m128i*)ps);
1195 cache_prefetch_next ((__m128i*)pd);
1196 maybe_prefetch_next ((__m128i*)pm);
1198 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1199 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1201 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1202 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1204 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1205 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1206 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1207 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1209 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1210 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1212 pix_add_multiply_2x128 (
1213 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1214 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1215 &xmm_dst_lo, &xmm_dst_hi);
1218 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1229 s = combine1 (ps, pm);
1232 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1240 static force_inline uint32_t
1241 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1244 __m64 s = unpack_32_1x64 (src);
1245 __m64 d = unpack_32_1x64 (dst);
1247 __m64 sa = expand_alpha_1x64 (s);
1248 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1250 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
/* ATOP_REVERSE combiner: dst = src * (255 - dst.alpha) + dst * src.alpha
 * (only the negate target differs from the ATOP combiner).
 * NOTE(review): loop headers/braces missing from this extract;
 * `(unsigned long) pd` should be uintptr_t for LLP64 portability.
 */
1253 static force_inline void
1254 core_combine_reverse_atop_u_sse2 (uint32_t* pd,
1261 __m128i xmm_src_lo, xmm_src_hi;
1262 __m128i xmm_dst_lo, xmm_dst_hi;
1263 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1264 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1266 /* call prefetch hint to optimize cache load*/
1267 cache_prefetch ((__m128i*)ps);
1268 cache_prefetch ((__m128i*)pd);
1269 maybe_prefetch ((__m128i*)pm);
1271 while (w && ((unsigned long) pd & 15))
1273 s = combine1 (ps, pm);
1276 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1283 /* call prefetch hint to optimize cache load*/
1284 cache_prefetch ((__m128i*)ps);
1285 cache_prefetch ((__m128i*)pd);
1286 maybe_prefetch ((__m128i*)pm);
1290 /* fill cache line with next memory */
1291 cache_prefetch_next ((__m128i*)ps);
1292 cache_prefetch_next ((__m128i*)pd);
1293 maybe_prefetch_next ((__m128i*)pm);
1295 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1296 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1298 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1299 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1301 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1302 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1303 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1304 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1306 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1307 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1309 pix_add_multiply_2x128 (
1310 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1311 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1312 &xmm_dst_lo, &xmm_dst_hi);
1315 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1326 s = combine1 (ps, pm);
1329 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1337 static force_inline uint32_t
1338 core_combine_xor_u_pixel_sse2 (uint32_t src,
1341 __m64 s = unpack_32_1x64 (src);
1342 __m64 d = unpack_32_1x64 (dst);
1344 __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1345 __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1347 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
/* XOR combiner: dst = src * (255 - dst.alpha) + dst * (255 - src.alpha)
 * (both alphas are negated before pix_add_multiply).
 * NOTE(review): loop headers/braces missing from this extract;
 * `(unsigned long) pd` should be uintptr_t for LLP64 portability.
 */
1350 static force_inline void
1351 core_combine_xor_u_sse2 (uint32_t* dst,
1352 const uint32_t* src,
1353 const uint32_t *mask,
1359 const uint32_t* ps = src;
1360 const uint32_t* pm = mask;
1362 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1363 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1364 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1365 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1367 /* call prefetch hint to optimize cache load*/
1368 cache_prefetch ((__m128i*)ps);
1369 cache_prefetch ((__m128i*)pd);
1370 maybe_prefetch ((__m128i*)pm);
1372 while (w && ((unsigned long) pd & 15))
1374 s = combine1 (ps, pm);
1377 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1384 /* call prefetch hint to optimize cache load*/
1385 cache_prefetch ((__m128i*)ps);
1386 cache_prefetch ((__m128i*)pd);
1387 maybe_prefetch ((__m128i*)pm);
1391 /* fill cache line with next memory */
1392 cache_prefetch_next ((__m128i*)ps);
1393 cache_prefetch_next ((__m128i*)pd);
1394 maybe_prefetch_next ((__m128i*)pm);
1396 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1397 xmm_dst = load_128_aligned ((__m128i*) pd);
1399 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1400 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1402 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1403 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1404 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1405 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1407 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1408 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1409 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1410 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1412 pix_add_multiply_2x128 (
1413 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1414 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1415 &xmm_dst_lo, &xmm_dst_hi);
1418 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1429 s = combine1 (ps, pm);
1432 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
/* ADD combiner: dst = saturating per-byte add of src and dst (MMX
 * adds in the scalar head/tail, _mm_adds_epu8 in the 4-pixel loop).
 * NOTE(review): loop headers/braces missing from this extract;
 * `(unsigned long)pd` should be uintptr_t for LLP64 portability.
 */
1440 static force_inline void
1441 core_combine_add_u_sse2 (uint32_t* dst,
1442 const uint32_t* src,
1443 const uint32_t* mask,
1449 const uint32_t* ps = src;
1450 const uint32_t* pm = mask;
1452 /* call prefetch hint to optimize cache load*/
1453 cache_prefetch ((__m128i*)ps);
1454 cache_prefetch ((__m128i*)pd);
1455 maybe_prefetch ((__m128i*)pm);
1457 while (w && (unsigned long)pd & 15)
1459 s = combine1 (ps, pm);
1465 *pd++ = _mm_cvtsi64_si32 (
1466 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1470 /* call prefetch hint to optimize cache load*/
1471 cache_prefetch ((__m128i*)ps);
1472 cache_prefetch ((__m128i*)pd);
1473 maybe_prefetch ((__m128i*)pm);
1479 /* fill cache line with next memory */
1480 cache_prefetch_next ((__m128i*)ps);
1481 cache_prefetch_next ((__m128i*)pd);
1482 maybe_prefetch_next ((__m128i*)pm);
1484 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1487 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1498 s = combine1 (ps, pm);
1502 *pd++ = _mm_cvtsi64_si32 (
1503 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
/* SATURATE for one pixel pair: when the source alpha exceeds the free
 * alpha in dst, scale src down so the sum does not overflow, then add.
 * NOTE(review): upstream lines 1517-1519 (apparently the sa > da
 * conditional guarding the rescale) are missing from this extract —
 * restore from the full source.
 */
1509 static force_inline uint32_t
1510 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1513 __m64 ms = unpack_32_1x64 (src);
1514 __m64 md = unpack_32_1x64 (dst);
1515 uint32_t sa = src >> 24;
/* da = free alpha in dst (255 - dst.alpha) */
1516 uint32_t da = ~dst >> 24;
1520 ms = pix_multiply_1x64 (
1521 ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1524 return pack_1x64_32 (_mm_adds_pu16 (md, ms));
/* SATURATE combiner: 4-pixel loop compares each source alpha against
 * the free alpha in dst (via pack_cmp); if any lane would overflow,
 * all four pixels fall back to the scalar helper, otherwise a plain
 * saturating add is used.
 * NOTE(review): loop headers, the pack_cmp branch structure and some
 * declarations are missing from this extract (gaps in upstream
 * numbering); `(unsigned long)pd` should be uintptr_t for LLP64.
 */
1527 static force_inline void
1528 core_combine_saturate_u_sse2 (uint32_t * pd,
1536 __m128i xmm_src, xmm_dst;
1538 /* call prefetch hint to optimize cache load*/
1539 cache_prefetch ((__m128i*)ps);
1540 cache_prefetch ((__m128i*)pd);
1541 maybe_prefetch ((__m128i*)pm);
1543 while (w && (unsigned long)pd & 15)
1545 s = combine1 (ps, pm);
1548 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1555 /* call prefetch hint to optimize cache load*/
1556 cache_prefetch ((__m128i*)ps);
1557 cache_prefetch ((__m128i*)pd);
1558 maybe_prefetch ((__m128i*)pm);
1562 /* fill cache line with next memory */
1563 cache_prefetch_next ((__m128i*)ps);
1564 cache_prefetch_next ((__m128i*)pd);
1565 maybe_prefetch_next ((__m128i*)pm);
1567 xmm_dst = load_128_aligned ((__m128i*)pd);
1568 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1570 pack_cmp = _mm_movemask_epi8 (
1572 _mm_srli_epi32 (xmm_src, 24),
1573 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1575 /* if some alpha src is grater than respective ~alpha dst */
1578 s = combine1 (ps++, pm);
1580 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1584 s = combine1 (ps++, pm);
1586 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1590 s = combine1 (ps++, pm);
1592 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1596 s = combine1 (ps++, pm);
1598 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1604 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1617 s = combine1 (ps, pm);
1620 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
/* SRC combiner, component alpha: dest = src * mask, per channel.
 * Head loop for alignment, 4-pixel SSE2 main loop, scalar tail.
 * NOTE(review): loop heads and braces are elided in this extract. */
1627 static force_inline void
1628 core_combine_src_ca_sse2 (uint32_t* pd,
1635 __m128i xmm_src_lo, xmm_src_hi;
1636 __m128i xmm_mask_lo, xmm_mask_hi;
1637 __m128i xmm_dst_lo, xmm_dst_hi;
1639 /* call prefetch hint to optimize cache load*/
1640 cache_prefetch ((__m128i*)ps);
1641 cache_prefetch ((__m128i*)pd);
1642 cache_prefetch ((__m128i*)pm);
1644 while (w && (unsigned long)pd & 15)
1648 *pd++ = pack_1x64_32 (
1649 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1653 /* call prefetch hint to optimize cache load*/
1654 cache_prefetch ((__m128i*)ps);
1655 cache_prefetch ((__m128i*)pd);
1656 cache_prefetch ((__m128i*)pm);
1660 /* fill cache line with next memory */
1661 cache_prefetch_next ((__m128i*)ps);
1662 cache_prefetch_next ((__m128i*)pd);
1663 cache_prefetch_next ((__m128i*)pm);
1665 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1666 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1668 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1669 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1671 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1672 &xmm_mask_lo, &xmm_mask_hi,
1673 &xmm_dst_lo, &xmm_dst_hi);
1676 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1688 *pd++ = pack_1x64_32 (
1689 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
/* OVER combiner (component alpha) for one pixel:
 * in_over (src, src.alpha, mask, dst).
 * NOTE(review): trailing parameters and braces are elided here. */
1694 static force_inline uint32_t
1695 core_combine_over_ca_pixel_sse2 (uint32_t src,
1699 __m64 s = unpack_32_1x64 (src);
1700 __m64 expAlpha = expand_alpha_1x64 (s);
1701 __m64 unpk_mask = unpack_32_1x64 (mask);
1702 __m64 unpk_dst = unpack_32_1x64 (dst);
1704 return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
/* OVER combiner, component alpha, over a run of w pixels: delegates
 * to in_over_2x128 for 4 pixels per main-loop iteration and to the
 * per-pixel helper in the alignment head and tail loops.
 * NOTE(review): loop heads and braces are elided in this extract. */
1707 static force_inline void
1708 core_combine_over_ca_sse2 (uint32_t* pd,
1715 __m128i xmm_alpha_lo, xmm_alpha_hi;
1716 __m128i xmm_src_lo, xmm_src_hi;
1717 __m128i xmm_dst_lo, xmm_dst_hi;
1718 __m128i xmm_mask_lo, xmm_mask_hi;
1720 /* call prefetch hint to optimize cache load*/
1721 cache_prefetch ((__m128i*)ps);
1722 cache_prefetch ((__m128i*)pd);
1723 cache_prefetch ((__m128i*)pm);
1725 while (w && (unsigned long)pd & 15)
1731 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1735 /* call prefetch hint to optimize cache load*/
1736 cache_prefetch ((__m128i*)ps);
1737 cache_prefetch ((__m128i*)pd);
1738 cache_prefetch ((__m128i*)pm);
1742 /* fill cache line with next memory */
1743 cache_prefetch_next ((__m128i*)ps);
1744 cache_prefetch_next ((__m128i*)pd);
1745 cache_prefetch_next ((__m128i*)pm);
1747 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1748 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1749 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1751 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1752 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1753 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1755 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1756 &xmm_alpha_lo, &xmm_alpha_hi);
1758 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1759 &xmm_alpha_lo, &xmm_alpha_hi,
1760 &xmm_mask_lo, &xmm_mask_hi,
1761 &xmm_dst_lo, &xmm_dst_hi);
1764 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1778 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
/* OVER_REVERSE combiner (component alpha) for one pixel:
 * dst OVER (src * mask).
 * NOTE(review): trailing parameters and braces are elided here. */
1783 static force_inline uint32_t
1784 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1788 __m64 d = unpack_32_1x64 (dst);
1790 return pack_1x64_32 (
1791 over_1x64 (d, expand_alpha_1x64 (d),
1792 pix_multiply_1x64 (unpack_32_1x64 (src),
1793 unpack_32_1x64 (mask))));
/* OVER_REVERSE combiner, component alpha, over a run of w pixels:
 * the main loop computes src * mask into the mask registers and then
 * blends dst over that product.  Alignment head and scalar tail use
 * the per-pixel helper.
 * NOTE(review): loop heads and braces are elided in this extract. */
1796 static force_inline void
1797 core_combine_over_reverse_ca_sse2 (uint32_t* pd,
1804 __m128i xmm_alpha_lo, xmm_alpha_hi;
1805 __m128i xmm_src_lo, xmm_src_hi;
1806 __m128i xmm_dst_lo, xmm_dst_hi;
1807 __m128i xmm_mask_lo, xmm_mask_hi;
1809 /* call prefetch hint to optimize cache load*/
1810 cache_prefetch ((__m128i*)ps);
1811 cache_prefetch ((__m128i*)pd);
1812 cache_prefetch ((__m128i*)pm);
1814 while (w && (unsigned long)pd & 15)
1820 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1824 /* call prefetch hint to optimize cache load*/
1825 cache_prefetch ((__m128i*)ps);
1826 cache_prefetch ((__m128i*)pd);
1827 cache_prefetch ((__m128i*)pm);
1831 /* fill cache line with next memory */
1832 cache_prefetch_next ((__m128i*)ps);
1833 cache_prefetch_next ((__m128i*)pd);
1834 cache_prefetch_next ((__m128i*)pm);
1836 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1837 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1838 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1840 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1841 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1842 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1844 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1845 &xmm_alpha_lo, &xmm_alpha_hi);
/* mask registers now hold src * mask */
1846 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1847 &xmm_mask_lo, &xmm_mask_hi,
1848 &xmm_mask_lo, &xmm_mask_hi);
1850 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1851 &xmm_alpha_lo, &xmm_alpha_hi,
1852 &xmm_mask_lo, &xmm_mask_hi);
1855 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1869 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
/* IN combiner, component alpha: dest = (src * mask) * dst.alpha.
 * NOTE(review): loop heads and braces are elided in this extract. */
1874 static force_inline void
1875 core_combine_in_ca_sse2 (uint32_t * pd,
1882 __m128i xmm_alpha_lo, xmm_alpha_hi;
1883 __m128i xmm_src_lo, xmm_src_hi;
1884 __m128i xmm_dst_lo, xmm_dst_hi;
1885 __m128i xmm_mask_lo, xmm_mask_hi;
1887 /* call prefetch hint to optimize cache load*/
1888 cache_prefetch ((__m128i*)ps);
1889 cache_prefetch ((__m128i*)pd);
1890 cache_prefetch ((__m128i*)pm);
1892 while (w && (unsigned long)pd & 15)
1898 *pd++ = pack_1x64_32 (
1900 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1901 expand_alpha_1x64 (unpack_32_1x64 (d))));
1906 /* call prefetch hint to optimize cache load*/
1907 cache_prefetch ((__m128i*)ps);
1908 cache_prefetch ((__m128i*)pd);
1909 cache_prefetch ((__m128i*)pm);
1913 /* fill cache line with next memory */
1914 cache_prefetch_next ((__m128i*)ps);
1915 cache_prefetch_next ((__m128i*)pd);
1916 cache_prefetch_next ((__m128i*)pm);
1918 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1919 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1920 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1922 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1923 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1924 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1926 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1927 &xmm_alpha_lo, &xmm_alpha_hi);
/* dst registers = src * mask */
1929 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1930 &xmm_mask_lo, &xmm_mask_hi,
1931 &xmm_dst_lo, &xmm_dst_hi);
/* dst registers = (src * mask) * dst.alpha */
1933 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1934 &xmm_alpha_lo, &xmm_alpha_hi,
1935 &xmm_dst_lo, &xmm_dst_hi);
1938 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1952 *pd++ = pack_1x64_32 (
1955 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1956 expand_alpha_1x64 (unpack_32_1x64 (d))));
/* IN_REVERSE combiner, component alpha:
 * dest = dst * (mask * src.alpha).
 * NOTE(review): loop heads and braces are elided in this extract. */
1962 static force_inline void
1963 core_combine_in_reverse_ca_sse2 (uint32_t * pd,
1970 __m128i xmm_alpha_lo, xmm_alpha_hi;
1971 __m128i xmm_src_lo, xmm_src_hi;
1972 __m128i xmm_dst_lo, xmm_dst_hi;
1973 __m128i xmm_mask_lo, xmm_mask_hi;
1975 /* call prefetch hint to optimize cache load*/
1976 cache_prefetch ((__m128i*)ps);
1977 cache_prefetch ((__m128i*)pd);
1978 cache_prefetch ((__m128i*)pm);
1980 while (w && (unsigned long)pd & 15)
1986 *pd++ = pack_1x64_32 (
1989 pix_multiply_1x64 (unpack_32_1x64 (m),
1990 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1994 /* call prefetch hint to optimize cache load*/
1995 cache_prefetch ((__m128i*)ps);
1996 cache_prefetch ((__m128i*)pd);
1997 cache_prefetch ((__m128i*)pm);
2001 /* fill cache line with next memory */
2002 cache_prefetch_next ((__m128i*)ps);
2003 cache_prefetch_next ((__m128i*)pd);
2004 cache_prefetch_next ((__m128i*)pm);
2006 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2007 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2008 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2010 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2011 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2012 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2014 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2015 &xmm_alpha_lo, &xmm_alpha_hi);
/* alpha registers = mask * src.alpha */
2016 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2017 &xmm_alpha_lo, &xmm_alpha_hi,
2018 &xmm_alpha_lo, &xmm_alpha_hi);
2020 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2021 &xmm_alpha_lo, &xmm_alpha_hi,
2022 &xmm_dst_lo, &xmm_dst_hi);
2025 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2039 *pd++ = pack_1x64_32 (
2042 pix_multiply_1x64 (unpack_32_1x64 (m),
2043 expand_alpha_1x64 (unpack_32_1x64 (s)))));
/* OUT combiner, component alpha:
 * dest = (src * mask) * ~dst.alpha.
 * NOTE(review): loop heads and braces are elided in this extract. */
2048 static force_inline void
2049 core_combine_out_ca_sse2 (uint32_t * pd,
2056 __m128i xmm_alpha_lo, xmm_alpha_hi;
2057 __m128i xmm_src_lo, xmm_src_hi;
2058 __m128i xmm_dst_lo, xmm_dst_hi;
2059 __m128i xmm_mask_lo, xmm_mask_hi;
2061 /* call prefetch hint to optimize cache load*/
2062 cache_prefetch ((__m128i*)ps);
2063 cache_prefetch ((__m128i*)pd);
2064 cache_prefetch ((__m128i*)pm);
2066 while (w && (unsigned long)pd & 15)
2072 *pd++ = pack_1x64_32 (
2075 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2076 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2080 /* call prefetch hint to optimize cache load*/
2081 cache_prefetch ((__m128i*)ps);
2082 cache_prefetch ((__m128i*)pd);
2083 cache_prefetch ((__m128i*)pm);
2087 /* fill cache line with next memory */
2088 cache_prefetch_next ((__m128i*)ps);
2089 cache_prefetch_next ((__m128i*)pd);
2090 cache_prefetch_next ((__m128i*)pm);
2092 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2093 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2094 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2096 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2097 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2098 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
/* alpha registers = ~dst.alpha */
2100 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2101 &xmm_alpha_lo, &xmm_alpha_hi);
2102 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2103 &xmm_alpha_lo, &xmm_alpha_hi);
2105 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2106 &xmm_mask_lo, &xmm_mask_hi,
2107 &xmm_dst_lo, &xmm_dst_hi);
2108 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2109 &xmm_alpha_lo, &xmm_alpha_hi,
2110 &xmm_dst_lo, &xmm_dst_hi);
2113 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2127 *pd++ = pack_1x64_32 (
2130 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2131 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
/* OUT_REVERSE combiner, component alpha:
 * dest = dst * ~(mask * src.alpha).
 * NOTE(review): loop heads and braces are elided in this extract. */
2137 static force_inline void
2138 core_combine_out_reverse_ca_sse2 (uint32_t * pd,
2145 __m128i xmm_alpha_lo, xmm_alpha_hi;
2146 __m128i xmm_src_lo, xmm_src_hi;
2147 __m128i xmm_dst_lo, xmm_dst_hi;
2148 __m128i xmm_mask_lo, xmm_mask_hi;
2150 /* call prefetch hint to optimize cache load*/
2151 cache_prefetch ((__m128i*)ps);
2152 cache_prefetch ((__m128i*)pd);
2153 cache_prefetch ((__m128i*)pm);
2155 while (w && (unsigned long)pd & 15)
2161 *pd++ = pack_1x64_32 (
2164 negate_1x64 (pix_multiply_1x64 (
2166 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2170 /* call prefetch hint to optimize cache load*/
2171 cache_prefetch ((__m128i*)ps);
2172 cache_prefetch ((__m128i*)pd);
2173 cache_prefetch ((__m128i*)pm);
2177 /* fill cache line with next memory */
2178 cache_prefetch_next ((__m128i*)ps);
2179 cache_prefetch_next ((__m128i*)pd);
2180 cache_prefetch_next ((__m128i*)pm);
2182 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2183 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2184 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2186 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2187 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2188 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2190 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2191 &xmm_alpha_lo, &xmm_alpha_hi);
/* mask registers = ~(mask * src.alpha) */
2193 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2194 &xmm_alpha_lo, &xmm_alpha_hi,
2195 &xmm_mask_lo, &xmm_mask_hi);
2197 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2198 &xmm_mask_lo, &xmm_mask_hi);
2200 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2201 &xmm_mask_lo, &xmm_mask_hi,
2202 &xmm_dst_lo, &xmm_dst_hi);
2205 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2219 *pd++ = pack_1x64_32 (
2222 negate_1x64 (pix_multiply_1x64 (
2224 expand_alpha_1x64 (unpack_32_1x64 (s))))));
/* ATOP combiner (component alpha) for one pixel:
 * dest = dst * ~(mask * src.alpha) + (src * mask) * dst.alpha,
 * computed with the fused pix_add_multiply helper.
 * NOTE(review): trailing parameters and braces are elided here. */
2229 static force_inline uint32_t
2230 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2234 __m64 m = unpack_32_1x64 (mask);
2235 __m64 s = unpack_32_1x64 (src);
2236 __m64 d = unpack_32_1x64 (dst);
2237 __m64 sa = expand_alpha_1x64 (s);
2238 __m64 da = expand_alpha_1x64 (d);
2240 s = pix_multiply_1x64 (s, m);
2241 m = negate_1x64 (pix_multiply_1x64 (m, sa));
2243 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
/* ATOP combiner, component alpha, over a run of w pixels: vector
 * version of core_combine_atop_ca_pixel_sse2, 4 pixels per main-loop
 * iteration.
 * NOTE(review): loop heads and braces are elided in this extract. */
2246 static force_inline void
2247 core_combine_atop_ca_sse2 (uint32_t * pd,
2254 __m128i xmm_src_lo, xmm_src_hi;
2255 __m128i xmm_dst_lo, xmm_dst_hi;
2256 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2257 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2258 __m128i xmm_mask_lo, xmm_mask_hi;
2260 /* call prefetch hint to optimize cache load*/
2261 cache_prefetch ((__m128i*)ps);
2262 cache_prefetch ((__m128i*)pd);
2263 cache_prefetch ((__m128i*)pm);
2265 while (w && (unsigned long)pd & 15)
2271 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2275 /* call prefetch hint to optimize cache load*/
2276 cache_prefetch ((__m128i*)ps);
2277 cache_prefetch ((__m128i*)pd);
2278 cache_prefetch ((__m128i*)pm);
2282 /* fill cache line with next memory */
2283 cache_prefetch_next ((__m128i*)ps);
2284 cache_prefetch_next ((__m128i*)pd);
2285 cache_prefetch_next ((__m128i*)pm);
2287 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2288 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2289 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2291 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2292 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2293 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2295 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2296 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2297 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2298 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
/* src registers = src * mask */
2300 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2301 &xmm_mask_lo, &xmm_mask_hi,
2302 &xmm_src_lo, &xmm_src_hi);
/* mask registers = ~(mask * src.alpha) */
2303 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2304 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2305 &xmm_mask_lo, &xmm_mask_hi);
2307 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2309 pix_add_multiply_2x128 (
2310 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2311 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2312 &xmm_dst_lo, &xmm_dst_hi);
2315 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2329 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
/* ATOP_REVERSE combiner (component alpha) for one pixel:
 * dest = dst * (mask * src.alpha) + (src * mask) * ~dst.alpha.
 * NOTE(review): trailing parameters and braces are elided here. */
2334 static force_inline uint32_t
2335 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2339 __m64 m = unpack_32_1x64 (mask);
2340 __m64 s = unpack_32_1x64 (src);
2341 __m64 d = unpack_32_1x64 (dst);
2343 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2344 __m64 sa = expand_alpha_1x64 (s);
2346 s = pix_multiply_1x64 (s, m);
2347 m = pix_multiply_1x64 (m, sa);
2349 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
/* ATOP_REVERSE combiner, component alpha, over a run of w pixels:
 * vector version of core_combine_reverse_atop_ca_pixel_sse2.
 * NOTE(review): loop heads and braces are elided in this extract. */
2352 static force_inline void
2353 core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
2360 __m128i xmm_src_lo, xmm_src_hi;
2361 __m128i xmm_dst_lo, xmm_dst_hi;
2362 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2363 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2364 __m128i xmm_mask_lo, xmm_mask_hi;
2366 /* call prefetch hint to optimize cache load*/
2367 cache_prefetch ((__m128i*)ps);
2368 cache_prefetch ((__m128i*)pd);
2369 cache_prefetch ((__m128i*)pm);
2371 while (w && (unsigned long)pd & 15)
2377 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2381 /* call prefetch hint to optimize cache load*/
2382 cache_prefetch ((__m128i*)ps);
2383 cache_prefetch ((__m128i*)pd);
2384 cache_prefetch ((__m128i*)pm);
2388 /* fill cache line with next memory */
2389 cache_prefetch_next ((__m128i*)ps);
2390 cache_prefetch_next ((__m128i*)pd);
2391 cache_prefetch_next ((__m128i*)pm);
2393 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2394 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2395 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2397 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2398 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2399 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2401 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2402 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2403 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2404 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
/* src registers = src * mask */
2406 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2407 &xmm_mask_lo, &xmm_mask_hi,
2408 &xmm_src_lo, &xmm_src_hi);
/* mask registers = mask * src.alpha */
2409 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2410 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2411 &xmm_mask_lo, &xmm_mask_hi);
/* unlike ATOP, here the destination alpha is the negated factor */
2413 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2414 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2416 pix_add_multiply_2x128 (
2417 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2418 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2419 &xmm_dst_lo, &xmm_dst_hi);
2422 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2436 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
/* XOR combiner (component alpha) for one pixel:
 * dest = dst * ~(mask * src.alpha) + (src * mask) * ~dst.alpha.
 * NOTE(review): trailing parameters, braces and the remaining
 * pix_add_multiply arguments are elided here. */
2441 static force_inline uint32_t
2442 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2446 __m64 a = unpack_32_1x64 (mask);
2447 __m64 s = unpack_32_1x64 (src);
2448 __m64 d = unpack_32_1x64 (dst);
2450 __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2451 a, expand_alpha_1x64 (s)));
2452 __m64 dest = pix_multiply_1x64 (s, a);
2453 __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2455 return pack_1x64_32 (pix_add_multiply_1x64 (&d,
/* XOR combiner, component alpha, over a run of w pixels: both factors
 * (~(mask * src.alpha) and ~dst.alpha) are negated before the fused
 * add-multiply, distinguishing this from the ATOP variants above.
 * NOTE(review): loop heads and braces are elided in this extract. */
2461 static force_inline void
2462 core_combine_xor_ca_sse2 (uint32_t * pd,
2469 __m128i xmm_src_lo, xmm_src_hi;
2470 __m128i xmm_dst_lo, xmm_dst_hi;
2471 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2472 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2473 __m128i xmm_mask_lo, xmm_mask_hi;
2475 /* call prefetch hint to optimize cache load*/
2476 cache_prefetch ((__m128i*)ps);
2477 cache_prefetch ((__m128i*)pd);
2478 cache_prefetch ((__m128i*)pm);
2480 while (w && (unsigned long)pd & 15)
2486 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2490 /* call prefetch hint to optimize cache load*/
2491 cache_prefetch ((__m128i*)ps);
2492 cache_prefetch ((__m128i*)pd);
2493 cache_prefetch ((__m128i*)pm);
2497 /* fill cache line with next memory */
2498 cache_prefetch_next ((__m128i*)ps);
2499 cache_prefetch_next ((__m128i*)pd);
2500 cache_prefetch_next ((__m128i*)pm);
2502 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2503 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2504 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2506 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2507 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2508 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2510 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2511 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2512 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2513 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2515 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2516 &xmm_mask_lo, &xmm_mask_hi,
2517 &xmm_src_lo, &xmm_src_hi);
2518 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2519 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2520 &xmm_mask_lo, &xmm_mask_hi);
2522 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2523 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2524 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2525 &xmm_mask_lo, &xmm_mask_hi);
2527 pix_add_multiply_2x128 (
2528 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2529 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2530 &xmm_dst_lo, &xmm_dst_hi);
2533 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2547 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
/* ADD combiner, component alpha:
 * dest = clamp (dst + src * mask), using unsigned saturating adds.
 * NOTE(review): loop heads and braces are elided in this extract. */
2552 static force_inline void
2553 core_combine_add_ca_sse2 (uint32_t * pd,
2560 __m128i xmm_src_lo, xmm_src_hi;
2561 __m128i xmm_dst_lo, xmm_dst_hi;
2562 __m128i xmm_mask_lo, xmm_mask_hi;
2564 /* call prefetch hint to optimize cache load*/
2565 cache_prefetch ((__m128i*)ps);
2566 cache_prefetch ((__m128i*)pd);
2567 cache_prefetch ((__m128i*)pm);
2569 while (w && (unsigned long)pd & 15)
2575 *pd++ = pack_1x64_32 (
2576 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2577 unpack_32_1x64 (m)),
2578 unpack_32_1x64 (d)));
2582 /* call prefetch hint to optimize cache load*/
2583 cache_prefetch ((__m128i*)ps);
2584 cache_prefetch ((__m128i*)pd);
2585 cache_prefetch ((__m128i*)pm);
2589 /* fill cache line with next memory */
2590 cache_prefetch_next ((__m128i*)ps);
2591 cache_prefetch_next ((__m128i*)pd);
2592 cache_prefetch_next ((__m128i*)pm);
2594 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2595 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2596 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2598 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2599 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2600 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2602 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2603 &xmm_mask_lo, &xmm_mask_hi,
2604 &xmm_src_lo, &xmm_src_hi);
2607 (__m128i*)pd, pack_2x128_128 (
2608 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2609 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2623 *pd++ = pack_1x64_32 (
2624 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2625 unpack_32_1x64 (m)),
2626 unpack_32_1x64 (d)));
2631 /* ---------------------------------------------------
2632 * fb_compose_setup_SSE2
/* Constant-broadcast helpers used by the composite setup code below. */
/* Replicate a 16-bit value into all four lanes of an __m64. */
2634 static force_inline __m64
2635 create_mask_16_64 (uint16_t mask)
2637 return _mm_set1_pi16 (mask);
/* Replicate a 16-bit value into all eight lanes of an __m128i. */
2640 static force_inline __m128i
2641 create_mask_16_128 (uint16_t mask)
2643 return _mm_set1_epi16 (mask);
/* Pack two 32-bit values into an __m64 (mask0 in the high half). */
2646 static force_inline __m64
2647 create_mask_2x32_64 (uint32_t mask0,
2650 return _mm_set_pi32 (mask0, mask1);
2653 /* Work around a code generation bug in Sun Studio 12. */
2654 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2655 # define create_mask_2x32_128(mask0, mask1) \
2656 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
/* Replicate the (mask0, mask1) pair into both 64-bit halves. */
2658 static force_inline __m128i
2659 create_mask_2x32_128 (uint32_t mask0,
2662 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2666 /* SSE2 code patch for fbcompose.c */
/* pixman combiner entry points.  Each wrapper forwards (dst, src,
 * mask, width) to the matching core_combine_*_sse2 worker above,
 * ignoring the implementation pointer and operator arguments.  Note
 * that the *_reverse_u / *_reverse_ca entry points map onto workers
 * named core_combine_reverse_*_sse2.
 * NOTE(review): return types, the dst/width parameters and braces
 * are elided in this extract. */
2669 sse2_combine_over_u (pixman_implementation_t *imp,
2672 const uint32_t * src,
2673 const uint32_t * mask,
2676 core_combine_over_u_sse2 (dst, src, mask, width);
2681 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2684 const uint32_t * src,
2685 const uint32_t * mask,
2688 core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2693 sse2_combine_in_u (pixman_implementation_t *imp,
2696 const uint32_t * src,
2697 const uint32_t * mask,
2700 core_combine_in_u_sse2 (dst, src, mask, width);
2705 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2708 const uint32_t * src,
2709 const uint32_t * mask,
2712 core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2717 sse2_combine_out_u (pixman_implementation_t *imp,
2720 const uint32_t * src,
2721 const uint32_t * mask,
2724 core_combine_out_u_sse2 (dst, src, mask, width);
2729 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2732 const uint32_t * src,
2733 const uint32_t * mask,
2736 core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2741 sse2_combine_atop_u (pixman_implementation_t *imp,
2744 const uint32_t * src,
2745 const uint32_t * mask,
2748 core_combine_atop_u_sse2 (dst, src, mask, width);
2753 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2756 const uint32_t * src,
2757 const uint32_t * mask,
2760 core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2765 sse2_combine_xor_u (pixman_implementation_t *imp,
2768 const uint32_t * src,
2769 const uint32_t * mask,
2772 core_combine_xor_u_sse2 (dst, src, mask, width);
2777 sse2_combine_add_u (pixman_implementation_t *imp,
2780 const uint32_t * src,
2781 const uint32_t * mask,
2784 core_combine_add_u_sse2 (dst, src, mask, width);
2789 sse2_combine_saturate_u (pixman_implementation_t *imp,
2792 const uint32_t * src,
2793 const uint32_t * mask,
2796 core_combine_saturate_u_sse2 (dst, src, mask, width);
2801 sse2_combine_src_ca (pixman_implementation_t *imp,
2804 const uint32_t * src,
2805 const uint32_t * mask,
2808 core_combine_src_ca_sse2 (dst, src, mask, width);
2813 sse2_combine_over_ca (pixman_implementation_t *imp,
2816 const uint32_t * src,
2817 const uint32_t * mask,
2820 core_combine_over_ca_sse2 (dst, src, mask, width);
2825 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2828 const uint32_t * src,
2829 const uint32_t * mask,
2832 core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2837 sse2_combine_in_ca (pixman_implementation_t *imp,
2840 const uint32_t * src,
2841 const uint32_t * mask,
2844 core_combine_in_ca_sse2 (dst, src, mask, width);
2849 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2852 const uint32_t * src,
2853 const uint32_t * mask,
2856 core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2861 sse2_combine_out_ca (pixman_implementation_t *imp,
2864 const uint32_t * src,
2865 const uint32_t * mask,
2868 core_combine_out_ca_sse2 (dst, src, mask, width);
2873 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2876 const uint32_t * src,
2877 const uint32_t * mask,
2880 core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2885 sse2_combine_atop_ca (pixman_implementation_t *imp,
2888 const uint32_t * src,
2889 const uint32_t * mask,
2892 core_combine_atop_ca_sse2 (dst, src, mask, width);
2897 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2900 const uint32_t * src,
2901 const uint32_t * mask,
2904 core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2909 sse2_combine_xor_ca (pixman_implementation_t *imp,
2912 const uint32_t * src,
2913 const uint32_t * mask,
2916 core_combine_xor_ca_sse2 (dst, src, mask, width);
2921 sse2_combine_add_ca (pixman_implementation_t *imp,
2924 const uint32_t * src,
2925 const uint32_t * mask,
2928 core_combine_add_ca_sse2 (dst, src, mask, width);
2932 /* -------------------------------------------------------------------
2933 * composite_over_n_8888
/* Composite a solid ("n") color OVER an 8888 destination.  The solid
 * source is fetched once via _pixman_image_get_solid, pre-expanded
 * to 16 bits per channel together with its alpha, then blended row
 * by row: one pixel at a time until dst is 16-byte aligned, four
 * pixels per SSE2 iteration, then a scalar tail.
 * NOTE(review): the return-type line, several locals, the
 * zero-source early-out and loop heads are elided in this extract. */
2937 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2939 pixman_image_t * src_image,
2940 pixman_image_t * mask_image,
2941 pixman_image_t * dst_image,
2952 uint32_t *dst_line, *dst, d;
2955 __m128i xmm_src, xmm_alpha;
2956 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2958 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2963 PIXMAN_IMAGE_GET_LINE (
2964 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
/* expand the solid color (and its alpha) once, outside the loops */
2966 xmm_src = expand_pixel_32_1x128 (src);
2967 xmm_alpha = expand_alpha_1x128 (xmm_src);
2973 /* call prefetch hint to optimize cache load*/
2974 cache_prefetch ((__m128i*)dst);
2976 dst_line += dst_stride;
2979 while (w && (unsigned long)dst & 15)
2982 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2983 _mm_movepi64_pi64 (xmm_alpha),
2984 unpack_32_1x64 (d)));
2988 cache_prefetch ((__m128i*)dst);
2992 /* fill cache line with next memory */
2993 cache_prefetch_next ((__m128i*)dst);
2995 xmm_dst = load_128_aligned ((__m128i*)dst);
2997 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2999 over_2x128 (&xmm_src, &xmm_src,
3000 &xmm_alpha, &xmm_alpha,
3001 &xmm_dst_lo, &xmm_dst_hi);
3003 /* rebuild the 4 pixel data and save */
3005 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3014 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3015 _mm_movepi64_pi64 (xmm_alpha),
3016 unpack_32_1x64 (d)));
3024 /* ---------------------------------------------------------------------
3025 * composite_over_n_0565
/* Composite a solid color OVER an r5g6b5 destination.  Each 128-bit
 * destination load holds 8 r5g6b5 pixels, which are expanded to four
 * 2x128 groups (unpack_565_128_4x128), blended, and re-packed to 565
 * before the aligned store.
 * NOTE(review): the return-type line, locals, the zero-source
 * early-out and loop heads are elided in this extract. */
3028 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3030 pixman_image_t * src_image,
3031 pixman_image_t * mask_image,
3032 pixman_image_t * dst_image,
3043 uint16_t *dst_line, *dst, d;
3046 __m128i xmm_src, xmm_alpha;
3047 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3049 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3054 PIXMAN_IMAGE_GET_LINE (
3055 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3057 xmm_src = expand_pixel_32_1x128 (src);
3058 xmm_alpha = expand_alpha_1x128 (xmm_src);
3064 /* call prefetch hint to optimize cache load*/
3065 cache_prefetch ((__m128i*)dst);
3067 dst_line += dst_stride;
3070 while (w && (unsigned long)dst & 15)
3074 *dst++ = pack_565_32_16 (
3075 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3076 _mm_movepi64_pi64 (xmm_alpha),
3077 expand565_16_1x64 (d))));
3081 /* call prefetch hint to optimize cache load*/
3082 cache_prefetch ((__m128i*)dst);
3086 /* fill cache line with next memory */
3087 cache_prefetch_next ((__m128i*)dst);
3089 xmm_dst = load_128_aligned ((__m128i*)dst);
3091 unpack_565_128_4x128 (xmm_dst,
3092 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3094 over_2x128 (&xmm_src, &xmm_src,
3095 &xmm_alpha, &xmm_alpha,
3096 &xmm_dst0, &xmm_dst1);
3097 over_2x128 (&xmm_src, &xmm_src,
3098 &xmm_alpha, &xmm_alpha,
3099 &xmm_dst2, &xmm_dst3);
3101 xmm_dst = pack_565_4x128_128 (
3102 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3104 save_128_aligned ((__m128i*)dst, xmm_dst);
3113 *dst++ = pack_565_32_16 (
3114 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3115 _mm_movepi64_pi64 (xmm_alpha),
3116 expand565_16_1x64 (d))));
3123 /* ------------------------------
3124 * composite_add_n_8888_8888_ca
/* Composite ADD of a solid source through a component-alpha 8888
 * mask into an 8888 destination: dest = clamp (dest + src * mask).
 * For each 4-pixel block the mask is compared against zero
 * (_mm_cmpeq_epi32 + movemask below); fully transparent blocks skip
 * the destination read-modify-write entirely.
 * NOTE(review): the return-type line, locals, the zero-source
 * early-out and loop heads are elided in this extract. */
3127 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3129 pixman_image_t * src_image,
3130 pixman_image_t * mask_image,
3131 pixman_image_t * dst_image,
3142 uint32_t *dst_line, d;
3143 uint32_t *mask_line, m;
3145 int dst_stride, mask_stride;
3147 __m128i xmm_src, xmm_alpha;
3149 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3151 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3153 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3159 PIXMAN_IMAGE_GET_LINE (
3160 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3161 PIXMAN_IMAGE_GET_LINE (
3162 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
/* widen the solid color to 16 bits per channel, once */
3164 xmm_src = _mm_unpacklo_epi8 (
3165 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3166 xmm_alpha = expand_alpha_1x128 (xmm_src);
3167 mmx_src = _mm_movepi64_pi64 (xmm_src);
3168 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3173 const uint32_t *pm = (uint32_t *)mask_line;
3174 uint32_t *pd = (uint32_t *)dst_line;
3176 dst_line += dst_stride;
3177 mask_line += mask_stride;
3179 /* call prefetch hint to optimize cache load*/
3180 cache_prefetch ((__m128i*)pd);
3181 cache_prefetch ((__m128i*)pm);
3183 while (w && (unsigned long)pd & 15)
3191 mmx_mask = unpack_32_1x64 (m);
3192 mmx_dest = unpack_32_1x64 (d);
3194 *pd = pack_1x64_32 (
3195 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3202 /* call prefetch hint to optimize cache load*/
3203 cache_prefetch ((__m128i*)pd);
3204 cache_prefetch ((__m128i*)pm);
3208 /* fill cache line with next memory */
3209 cache_prefetch_next ((__m128i*)pd);
3210 cache_prefetch_next ((__m128i*)pm);
3212 xmm_mask = load_128_unaligned ((__m128i*)pm);
3216 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3218 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3219 if (pack_cmp != 0xffff)
3221 xmm_dst = load_128_aligned ((__m128i*)pd);
3223 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3225 pix_multiply_2x128 (&xmm_src, &xmm_src,
3226 &xmm_mask_lo, &xmm_mask_hi,
3227 &xmm_mask_lo, &xmm_mask_hi);
3228 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3231 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3247 mmx_mask = unpack_32_1x64 (m);
3248 mmx_dest = unpack_32_1x64 (d);
3250 *pd = pack_1x64_32 (
3251 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3262 /* ---------------------------------------------------------------------------
3263 * composite_over_n_8888_8888_ca
3267 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3269 pixman_image_t * src_image,
3270 pixman_image_t * mask_image,
3271 pixman_image_t * dst_image,
3282 uint32_t *dst_line, d;
3283 uint32_t *mask_line, m;
3285 int dst_stride, mask_stride;
3287 __m128i xmm_src, xmm_alpha;
3288 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3289 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3291 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3293 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3298 PIXMAN_IMAGE_GET_LINE (
3299 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3300 PIXMAN_IMAGE_GET_LINE (
3301 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3303 xmm_src = _mm_unpacklo_epi8 (
3304 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3305 xmm_alpha = expand_alpha_1x128 (xmm_src);
3306 mmx_src = _mm_movepi64_pi64 (xmm_src);
3307 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3312 const uint32_t *pm = (uint32_t *)mask_line;
3313 uint32_t *pd = (uint32_t *)dst_line;
3315 dst_line += dst_stride;
3316 mask_line += mask_stride;
3318 /* call prefetch hint to optimize cache load*/
3319 cache_prefetch ((__m128i*)pd);
3320 cache_prefetch ((__m128i*)pm);
3322 while (w && (unsigned long)pd & 15)
3329 mmx_mask = unpack_32_1x64 (m);
3330 mmx_dest = unpack_32_1x64 (d);
3332 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3342 /* call prefetch hint to optimize cache load*/
3343 cache_prefetch ((__m128i*)pd);
3344 cache_prefetch ((__m128i*)pm);
3348 /* fill cache line with next memory */
3349 cache_prefetch_next ((__m128i*)pd);
3350 cache_prefetch_next ((__m128i*)pm);
3352 xmm_mask = load_128_unaligned ((__m128i*)pm);
3356 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3358 /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
3359 if (pack_cmp != 0xffff)
3361 xmm_dst = load_128_aligned ((__m128i*)pd);
3363 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3364 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3366 in_over_2x128 (&xmm_src, &xmm_src,
3367 &xmm_alpha, &xmm_alpha,
3368 &xmm_mask_lo, &xmm_mask_hi,
3369 &xmm_dst_lo, &xmm_dst_hi);
3372 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3387 mmx_mask = unpack_32_1x64 (m);
3388 mmx_dest = unpack_32_1x64 (d);
3390 *pd = pack_1x64_32 (
3391 in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3402 /*---------------------------------------------------------------------
3403 * composite_over_8888_n_8888
/*
 * OVER-composite an a8r8g8b8 source, attenuated by a solid mask's alpha,
 * onto an 8888 destination (SSE2).
 *
 * Only the alpha byte of the solid mask is used (mask >> 24), expanded
 * to all channels via create_mask_16_128.  Scanlines run as scalar head
 * (until dst is 16-byte aligned), 4-pixel SSE2 body, scalar tail.
 *
 * NOTE(review): partial extract -- some loop headers/statements of the
 * original are not visible here.
 */
3407 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3409                                  pixman_image_t * src_image,
3410                                  pixman_image_t * mask_image,
3411                                  pixman_image_t * dst_image,
3421     uint32_t *dst_line, *dst;
3422     uint32_t *src_line, *src;
3425     int dst_stride, src_stride;
3428     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3429     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3430     __m128i xmm_alpha_lo, xmm_alpha_hi;
3432     PIXMAN_IMAGE_GET_LINE (
3433         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3434     PIXMAN_IMAGE_GET_LINE (
3435         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3437     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
    /* replicate the solid mask's alpha byte into every 16-bit channel */
3439     xmm_mask = create_mask_16_128 (mask >> 24);
3444         dst_line += dst_stride;
3446         src_line += src_stride;
3449         /* call prefetch hint to optimize cache load */
3450         cache_prefetch ((__m128i*)dst);
3451         cache_prefetch ((__m128i*)src);
    /* scalar head until dst is 16-byte aligned */
3453         while (w && (unsigned long)dst & 15)
3455             uint32_t s = *src++;
3458             __m64 ms = unpack_32_1x64 (s);
3459             __m64 alpha = expand_alpha_1x64 (ms);
            /* careful: despite the names, 'dest' holds the solid MASK and
             * 'alpha_dst' the destination pixel -- they are passed in
             * in_over_1x64's (src, alpha, mask, dest) argument order */
3460             __m64 dest = _mm_movepi64_pi64 (xmm_mask);
3461             __m64 alpha_dst = unpack_32_1x64 (d);
3463             *dst++ = pack_1x64_32 (
3464                 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
3469         /* call prefetch hint to optimize cache load */
3470         cache_prefetch ((__m128i*)dst);
3471         cache_prefetch ((__m128i*)src);
3475             /* fill cache line with next memory */
3476             cache_prefetch_next ((__m128i*)dst);
3477             cache_prefetch_next ((__m128i*)src);
3479             xmm_src = load_128_unaligned ((__m128i*)src);
3480             xmm_dst = load_128_aligned ((__m128i*)dst);
3482             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3483             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3484             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3485                                 &xmm_alpha_lo, &xmm_alpha_hi);
3487             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3488                            &xmm_alpha_lo, &xmm_alpha_hi,
3489                            &xmm_mask, &xmm_mask,
3490                            &xmm_dst_lo, &xmm_dst_hi);
3493                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    /* scalar tail (here the locals are named correctly) */
3502             uint32_t s = *src++;
3505             __m64 ms = unpack_32_1x64 (s);
3506             __m64 alpha = expand_alpha_1x64 (ms);
3507             __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3508             __m64 dest = unpack_32_1x64 (d);
3510             *dst++ = pack_1x64_32 (
3511                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3520 /* ---------------------------------------------------------------------
3521 * composite_over_x888_n_8888
/*
 * OVER-composite an x8r8g8b8 source (alpha channel ignored, forced
 * opaque with 0xff000000), attenuated by a solid mask's alpha, onto an
 * 8888 destination (SSE2).
 *
 * Because the source is treated as fully opaque, the per-pixel source
 * alpha is the constant mask_00ff, so xmm_alpha is hoisted out of the
 * loops entirely.
 *
 * NOTE(review): partial extract -- some loop headers/statements of the
 * original are not visible here.
 */
3524 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3526                                  pixman_image_t * src_image,
3527                                  pixman_image_t * mask_image,
3528                                  pixman_image_t * dst_image,
3538     uint32_t *dst_line, *dst;
3539     uint32_t *src_line, *src;
3541     int dst_stride, src_stride;
3544     __m128i xmm_mask, xmm_alpha;
3545     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3546     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3548     PIXMAN_IMAGE_GET_LINE (
3549         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3550     PIXMAN_IMAGE_GET_LINE (
3551         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3553     mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
3555     xmm_mask = create_mask_16_128 (mask >> 24);
    /* source is opaque, so its expanded alpha is the constant 0x00ff mask */
3556     xmm_alpha = mask_00ff;
3561         dst_line += dst_stride;
3563         src_line += src_stride;
3566         /* call prefetch hint to optimize cache load */
3567         cache_prefetch ((__m128i*)dst);
3568         cache_prefetch ((__m128i*)src);
    /* scalar head until dst is 16-byte aligned; force the alpha byte on */
3570         while (w && (unsigned long)dst & 15)
3572             uint32_t s = (*src++) | 0xff000000;
3575             __m64 src = unpack_32_1x64 (s);
3576             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3577             __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3578             __m64 dest = unpack_32_1x64 (d);
3580             *dst++ = pack_1x64_32 (
3581                 in_over_1x64 (&src, &alpha, &mask, &dest));
3586         /* call prefetch hint to optimize cache load */
3587         cache_prefetch ((__m128i*)dst);
3588         cache_prefetch ((__m128i*)src);
3592             /* fill cache line with next memory */
3593             cache_prefetch_next ((__m128i*)dst);
3594             cache_prefetch_next ((__m128i*)src);
            /* force alpha to 0xff on all four pixels at once */
3596             xmm_src = _mm_or_si128 (
3597                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3598             xmm_dst = load_128_aligned ((__m128i*)dst);
3600             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3601             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3603             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3604                            &xmm_alpha, &xmm_alpha,
3605                            &xmm_mask, &xmm_mask,
3606                            &xmm_dst_lo, &xmm_dst_hi);
3609                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    /* scalar tail */
3619             uint32_t s = (*src++) | 0xff000000;
3622             __m64 src = unpack_32_1x64 (s);
3623             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3624             __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3625             __m64 dest = unpack_32_1x64 (d);
3627             *dst++ = pack_1x64_32 (
3628                 in_over_1x64 (&src, &alpha, &mask, &dest));
3637 /* --------------------------------------------------------------------
3638 * composite_over_8888_8888
/*
 * OVER-composite a8r8g8b8 onto a8r8g8b8/x8r8g8b8 with no mask.
 *
 * Thin wrapper: sets up line pointers/strides and delegates each
 * scanline to core_combine_over_u_sse2 (NULL mask = unified OVER).
 *
 * NOTE(review): partial extract -- the per-scanline loop header is not
 * visible here.
 */
3641 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3643                                pixman_image_t * src_image,
3644                                pixman_image_t * mask_image,
3645                                pixman_image_t * dst_image,
3655     int dst_stride, src_stride;
3656     uint32_t *dst_line, *dst;
3657     uint32_t *src_line, *src;
3659     PIXMAN_IMAGE_GET_LINE (
3660         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3661     PIXMAN_IMAGE_GET_LINE (
3662         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
        /* all per-pixel work happens in the shared combiner */
3669         core_combine_over_u_sse2 (dst, src, NULL, width);
3677 /* ------------------------------------------------------------------
3678 * composite_over_8888_0565
/*
 * Single-pixel OVER of an a8r8g8b8 source onto an r5g6b5 destination:
 * unpack src to 16 bits/channel, expand dst from 565, blend with
 * over_1x64 using the source's expanded alpha, and repack to 565.
 * Used by the scalar head/tail loops of sse2_composite_over_8888_0565.
 */
3680 static force_inline uint16_t
3681 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3685     ms = unpack_32_1x64 (src);
3686     return pack_565_32_16 (
3689             ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
/*
 * OVER-composite a8r8g8b8 onto an r5g6b5 destination (SSE2).
 *
 * The SSE2 body processes 8 destination pixels per iteration: one
 * 128-bit load covers eight 565 pixels, which unpack into four 128-bit
 * registers (xmm_dst0..3); two 4-pixel source loads are blended over
 * them and the result is repacked to 565 in one aligned store.
 *
 * NOTE(review): partial extract -- some loop headers/statements of the
 * original are not visible here.
 */
3693 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3695                                pixman_image_t * src_image,
3696                                pixman_image_t * mask_image,
3697                                pixman_image_t * dst_image,
3707     uint16_t    *dst_line, *dst, d;
3708     uint32_t    *src_line, *src, s;
3709     int dst_stride, src_stride;
3712     __m128i xmm_alpha_lo, xmm_alpha_hi;
3713     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3714     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3716     PIXMAN_IMAGE_GET_LINE (
3717         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3718     PIXMAN_IMAGE_GET_LINE (
3719         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3724      * I copy the code from MMX one and keep the fixme.
3725      * If it's a problem there, probably is a problem here.
3727     assert (src_image->drawable == mask_image->drawable);
3735         /* call prefetch hint to optimize cache load */
3736         cache_prefetch ((__m128i*)src);
3737         cache_prefetch ((__m128i*)dst);
3739         dst_line += dst_stride;
3740         src_line += src_stride;
3743         /* Align dst on a 16-byte boundary */
3745                ((unsigned long)dst & 15))
3750             *dst++ = composite_over_8888_0565pixel (s, d);
3754         /* call prefetch hint to optimize cache load */
3755         cache_prefetch ((__m128i*)src);
3756         cache_prefetch ((__m128i*)dst);
3758         /* It's a 8 pixel loop */
3761             /* fill cache line with next memory */
3762             cache_prefetch_next ((__m128i*)src);
3763             cache_prefetch_next ((__m128i*)dst);
3765             /* I'm loading unaligned because I'm not sure
3766              * about the address alignment.
3768             xmm_src = load_128_unaligned ((__m128i*) src);
3769             xmm_dst = load_128_aligned ((__m128i*) dst);
            /* eight 565 pixels unpack into four 16-bpc registers */
3772             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3773             unpack_565_128_4x128 (xmm_dst,
3774                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3775             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3776                                 &xmm_alpha_lo, &xmm_alpha_hi);
3778             /* I'm loading the next 4 pixels from memory
3779              * ahead of time to optimize the memory read.
3781             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3783             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3784                         &xmm_alpha_lo, &xmm_alpha_hi,
3785                         &xmm_dst0, &xmm_dst1);
            /* second half: pixels 4..7 */
3788             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3789             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3790                                 &xmm_alpha_lo, &xmm_alpha_hi);
3792             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3793                         &xmm_alpha_lo, &xmm_alpha_hi,
3794                         &xmm_dst2, &xmm_dst3);
3797                 (__m128i*)dst, pack_565_4x128_128 (
3798                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    /* scalar tail */
3810             *dst++ = composite_over_8888_0565pixel (s, d);
3817 /* -----------------------------------------------------------------
3818 * composite_over_n_8_8888
/*
 * OVER-composite a solid source through an a8 mask onto an 8888
 * destination (SSE2).
 *
 * Four 8-bit mask values are read at once as a uint32_t in the SSE2
 * body.  Fast path: if the solid color is opaque (srca == 0xff) and all
 * four mask bytes are 0xff, the pre-built solid quad (xmm_def) is
 * stored directly, skipping the blend entirely.
 *
 * NOTE(review): partial extract -- some loop headers/statements of the
 * original are not visible here.
 */
3822 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3824                               pixman_image_t * src_image,
3825                               pixman_image_t * mask_image,
3826                               pixman_image_t * dst_image,
3837     uint32_t *dst_line, *dst;
3838     uint8_t *mask_line, *mask;
3839     int dst_stride, mask_stride;
3843     __m128i xmm_src, xmm_alpha, xmm_def;
3844     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3845     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3847     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3849     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3855     PIXMAN_IMAGE_GET_LINE (
3856         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3857     PIXMAN_IMAGE_GET_LINE (
3858         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    /* xmm_def holds four packed copies of the raw color for the
     * opaque-store fast path; xmm_src/xmm_alpha are the expanded forms */
3860     xmm_def = create_mask_2x32_128 (src, src);
3861     xmm_src = expand_pixel_32_1x128 (src);
3862     xmm_alpha = expand_alpha_1x128 (xmm_src);
3863     mmx_src = _mm_movepi64_pi64 (xmm_src);
3864     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3869         dst_line += dst_stride;
3871         mask_line += mask_stride;
3874         /* call prefetch hint to optimize cache load */
3875         cache_prefetch ((__m128i*)mask);
3876         cache_prefetch ((__m128i*)dst);
    /* scalar head until dst is 16-byte aligned */
3878         while (w && (unsigned long)dst & 15)
3880             uint8_t m = *mask++;
3885                 mmx_mask = expand_pixel_8_1x64 (m);
3886                 mmx_dest = unpack_32_1x64 (d);
3888                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3898         /* call prefetch hint to optimize cache load */
3899         cache_prefetch ((__m128i*)mask);
3900         cache_prefetch ((__m128i*)dst);
3904             /* fill cache line with next memory */
3905             cache_prefetch_next ((__m128i*)mask);
3906             cache_prefetch_next ((__m128i*)dst);
            /* read four a8 mask values in one 32-bit load */
3908             m = *((uint32_t*)mask);
3910             if (srca == 0xff && m == 0xffffffff)
                /* opaque color, fully-on mask: plain store */
3912                 save_128_aligned ((__m128i*)dst, xmm_def);
3916                 xmm_dst = load_128_aligned ((__m128i*) dst);
3917                 xmm_mask = unpack_32_1x128 (m);
3918                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3921                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3922                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3924                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3925                                         &xmm_mask_lo, &xmm_mask_hi);
3927                 in_over_2x128 (&xmm_src, &xmm_src,
3928                                &xmm_alpha, &xmm_alpha,
3929                                &xmm_mask_lo, &xmm_mask_hi,
3930                                &xmm_dst_lo, &xmm_dst_hi);
3933                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    /* scalar tail */
3943             uint8_t m = *mask++;
3948                 mmx_mask = expand_pixel_8_1x64 (m);
3949                 mmx_dest = unpack_32_1x64 (d);
3951                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3965 /* ----------------------------------------------------------------
3966  * pixman_fill_sse2
/*
 * Fill a rectangle of a 16bpp or 32bpp image with a solid value using
 * SSE2 stores.  Returns early for any other bpp (visible as the
 * bpp != 16 && bpp != 32 guard).
 *
 * The value is replicated across a 128-bit register (for 16bpp the
 * 16-bit value is first doubled into a 32-bit word).  Each row is
 * written as: scalar head to reach 4- then 16-byte alignment, unrolled
 * aligned-store bodies of 128 / 64 / 32 / 16 bytes, then a scalar tail.
 *
 * NOTE(review): partial extract -- the loop headers, width/height
 * bookkeeping, and return statements are not all visible here.
 */
3970 pixman_fill_sse2 (uint32_t *bits,
3979     uint32_t byte_width;
3984     if (bpp != 16 && bpp != 32)
        /* 16bpp: stride is in uint32_t units; convert to uint16_t units */
3989         stride = stride * (int) sizeof (uint32_t) / 2;
3990         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3991         byte_width = 2 * width;
        /* duplicate the 16-bit value into both halves of a 32-bit word */
3993         data = (data & 0xffff) * 0x00010001;
3997         stride = stride * (int) sizeof (uint32_t) / 4;
3998         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3999         byte_width = 4 * width;
4003     cache_prefetch ((__m128i*)byte_line);
4004     xmm_def = create_mask_2x32_128 (data, data);
4009         uint8_t *d = byte_line;
4010         byte_line += stride;
4014         cache_prefetch_next ((__m128i*)d);
        /* align d to 4 bytes with 16-bit stores */
4016         while (w >= 2 && ((unsigned long)d & 3))
4018             *(uint16_t *)d = data;
        /* then to 16 bytes with 32-bit stores */
4023         while (w >= 4 && ((unsigned long)d & 15))
4025             *(uint32_t *)d = data;
4031         cache_prefetch_next ((__m128i*)d);
        /* 128-byte unrolled body: eight aligned 16-byte stores */
4035             cache_prefetch (((__m128i*)d) + 12);
4037             save_128_aligned ((__m128i*)(d), xmm_def);
4038             save_128_aligned ((__m128i*)(d + 16), xmm_def);
4039             save_128_aligned ((__m128i*)(d + 32), xmm_def);
4040             save_128_aligned ((__m128i*)(d + 48), xmm_def);
4041             save_128_aligned ((__m128i*)(d + 64), xmm_def);
4042             save_128_aligned ((__m128i*)(d + 80), xmm_def);
4043             save_128_aligned ((__m128i*)(d + 96), xmm_def);
4044             save_128_aligned ((__m128i*)(d + 112), xmm_def);
        /* 64-byte body */
4052             cache_prefetch (((__m128i*)d) + 8);
4054             save_128_aligned ((__m128i*)(d), xmm_def);
4055             save_128_aligned ((__m128i*)(d + 16), xmm_def);
4056             save_128_aligned ((__m128i*)(d + 32), xmm_def);
4057             save_128_aligned ((__m128i*)(d + 48), xmm_def);
4063         cache_prefetch_next ((__m128i*)d);
        /* 32-byte body */
4067             save_128_aligned ((__m128i*)(d), xmm_def);
4068             save_128_aligned ((__m128i*)(d + 16), xmm_def);
        /* single 16-byte store */
4076             save_128_aligned ((__m128i*)(d), xmm_def);
4082         cache_prefetch_next ((__m128i*)d);
        /* scalar tail: 32-bit then 16-bit remainder */
4086             *(uint32_t *)d = data;
4094             *(uint16_t *)d = data;
/*
 * SRC-composite a solid source through an a8 mask onto an 8888
 * destination (SSE2): dst = src * mask (no read of dst contributes).
 *
 * Fast paths: a fully transparent source falls back to pixman_fill_sse2
 * with zero (visible in the early pixman_fill_sse2 call); in the SSE2
 * body, an all-0xff mask quad with opaque color stores the solid quad
 * directly, and an all-zero mask quad stores zeros.
 *
 * NOTE(review): partial extract -- some loop headers/branch lines of
 * the original are not visible here.
 */
4105 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4107                              pixman_image_t * src_image,
4108                              pixman_image_t * mask_image,
4109                              pixman_image_t * dst_image,
4120     uint32_t *dst_line, *dst;
4121     uint8_t *mask_line, *mask;
4122     int dst_stride, mask_stride;
4126     __m128i xmm_src, xmm_def;
4127     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4129     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
    /* degenerate case: fill the whole rectangle with zero */
4134         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4135                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
4136                           dest_x, dest_y, width, height, 0);
4140     PIXMAN_IMAGE_GET_LINE (
4141         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4142     PIXMAN_IMAGE_GET_LINE (
4143         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4145     xmm_def = create_mask_2x32_128 (src, src);
4146     xmm_src = expand_pixel_32_1x128 (src);
4151         dst_line += dst_stride;
4153         mask_line += mask_stride;
4156         /* call prefetch hint to optimize cache load */
4157         cache_prefetch ((__m128i*)mask);
4158         cache_prefetch ((__m128i*)dst);
    /* scalar head until dst is 16-byte aligned */
4160         while (w && (unsigned long)dst & 15)
4162             uint8_t m = *mask++;
4166                 *dst = pack_1x64_32 (
4168                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4179         /* call prefetch hint to optimize cache load */
4180         cache_prefetch ((__m128i*)mask);
4181         cache_prefetch ((__m128i*)dst);
4185             /* fill cache line with next memory */
4186             cache_prefetch_next ((__m128i*)mask);
4187             cache_prefetch_next ((__m128i*)dst);
            /* four a8 mask values in one 32-bit load */
4189             m = *((uint32_t*)mask);
4191             if (srca == 0xff && m == 0xffffffff)
4193                 save_128_aligned ((__m128i*)dst, xmm_def);
4197                 xmm_mask = unpack_32_1x128 (m);
4198                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4201                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4203                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4204                                         &xmm_mask_lo, &xmm_mask_hi);
4206                 pix_multiply_2x128 (&xmm_src, &xmm_src,
4207                                     &xmm_mask_lo, &xmm_mask_hi,
4208                                     &xmm_mask_lo, &xmm_mask_hi);
4211                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
                /* all-zero mask quad: SRC operator writes zeros */
4215                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
    /* scalar tail */
4225             uint8_t m = *mask++;
4229                 *dst = pack_1x64_32 (
4231                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4246 /*-----------------------------------------------------------------------
4247 * composite_over_n_8_0565
/*
 * OVER-composite a solid source through an a8 mask onto an r5g6b5
 * destination (SSE2).
 *
 * The SSE2 body handles 8 destination pixels per iteration: one aligned
 * 565 load unpacks into xmm_dst0..3, and two groups of four mask bytes
 * (read as uint32_t) drive two in_over_2x128 blends before repacking.
 *
 * NOTE(review): partial extract -- some loop headers/statements of the
 * original are not visible here.
 */
4251 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4253                               pixman_image_t * src_image,
4254                               pixman_image_t * mask_image,
4255                               pixman_image_t * dst_image,
4266     uint16_t    *dst_line, *dst, d;
4267     uint8_t     *mask_line, *mask;
4268     int dst_stride, mask_stride;
4271     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4273     __m128i xmm_src, xmm_alpha;
4274     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4275     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4277     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4283     PIXMAN_IMAGE_GET_LINE (
4284         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4285     PIXMAN_IMAGE_GET_LINE (
4286         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4288     xmm_src = expand_pixel_32_1x128 (src);
4289     xmm_alpha = expand_alpha_1x128 (xmm_src);
4290     mmx_src = _mm_movepi64_pi64 (xmm_src);
4291     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4296         dst_line += dst_stride;
4298         mask_line += mask_stride;
4301         /* call prefetch hint to optimize cache load */
4302         cache_prefetch ((__m128i*)mask);
4303         cache_prefetch ((__m128i*)dst);
    /* scalar head until dst is 16-byte aligned */
4305         while (w && (unsigned long)dst & 15)
4312             mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4313             mmx_dest = expand565_16_1x64 (d);
4315             *dst = pack_565_32_16 (
4318                         &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4325         /* call prefetch hint to optimize cache load */
4326         cache_prefetch ((__m128i*)mask);
4327         cache_prefetch ((__m128i*)dst);
4331             /* fill cache line with next memory */
4332             cache_prefetch_next ((__m128i*)mask);
4333             cache_prefetch_next ((__m128i*)dst);
4335             xmm_dst = load_128_aligned ((__m128i*) dst);
4336             unpack_565_128_4x128 (xmm_dst,
4337                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
            /* first four mask bytes -> pixels 0..3 */
4339             m = *((uint32_t*)mask);
4344                 xmm_mask = unpack_32_1x128 (m);
4345                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4348                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4350                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4351                                         &xmm_mask_lo, &xmm_mask_hi);
4353                 in_over_2x128 (&xmm_src, &xmm_src,
4354                                &xmm_alpha, &xmm_alpha,
4355                                &xmm_mask_lo, &xmm_mask_hi,
4356                                &xmm_dst0, &xmm_dst1);
            /* next four mask bytes -> pixels 4..7 */
4359             m = *((uint32_t*)mask);
4364                 xmm_mask = unpack_32_1x128 (m);
4365                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4368                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4370                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4371                                         &xmm_mask_lo, &xmm_mask_hi);
4372                 in_over_2x128 (&xmm_src, &xmm_src,
4373                                &xmm_alpha, &xmm_alpha,
4374                                &xmm_mask_lo, &xmm_mask_hi,
4375                                &xmm_dst2, &xmm_dst3);
4379                 (__m128i*)dst, pack_565_4x128_128 (
4380                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    /* scalar tail */
4393             mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4394             mmx_dest = expand565_16_1x64 (d);
4396             *dst = pack_565_32_16 (
4399                         &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4410 /* -----------------------------------------------------------------------
4411 * composite_over_pixbuf_0565
/*
 * OVER-composite a "pixbuf" source (non-premultiplied, channels stored
 * reversed -- hence the invert_colors / over_rev_non_pre helpers) onto
 * an r5g6b5 destination (SSE2), 8 destination pixels per iteration.
 *
 * Per 4-source-pixel group the code branches on is_opaque/is_zero:
 * fully opaque pixels are just color-inverted and written, the general
 * case goes through over_rev_non_pre_2x128 (the all-zero case leaves
 * the destination unchanged).
 *
 * NOTE(review): partial extract -- some loop headers/branch lines of
 * the original are not visible here.
 */
4415 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4417                                  pixman_image_t * src_image,
4418                                  pixman_image_t * mask_image,
4419                                  pixman_image_t * dst_image,
4429     uint16_t    *dst_line, *dst, d;
4430     uint32_t    *src_line, *src, s;
4431     int dst_stride, src_stride;
4433     uint32_t opaque, zero;
4436     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4437     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4439     PIXMAN_IMAGE_GET_LINE (
4440         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4441     PIXMAN_IMAGE_GET_LINE (
4442         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4447      * I copy the code from MMX one and keep the fixme.
4448      * If it's a problem there, probably is a problem here.
4450     assert (src_image->drawable == mask_image->drawable);
4456         dst_line += dst_stride;
4458         src_line += src_stride;
4461         /* call prefetch hint to optimize cache load */
4462         cache_prefetch ((__m128i*)src);
4463         cache_prefetch ((__m128i*)dst);
    /* scalar head until dst is 16-byte aligned */
4465         while (w && (unsigned long)dst & 15)
4470             ms = unpack_32_1x64 (s);
4472             *dst++ = pack_565_32_16 (
4474                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4478         /* call prefetch hint to optimize cache load */
4479         cache_prefetch ((__m128i*)src);
4480         cache_prefetch ((__m128i*)dst);
4484             /* fill cache line with next memory */
4485             cache_prefetch_next ((__m128i*)src);
4486             cache_prefetch_next ((__m128i*)dst);
4489             xmm_src = load_128_unaligned ((__m128i*)src);
4490             xmm_dst = load_128_aligned ((__m128i*)dst);
4492             opaque = is_opaque (xmm_src);
4493             zero = is_zero (xmm_src);
4495             unpack_565_128_4x128 (xmm_dst,
4496                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4497             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4499             /* preload next round */
4500             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
            /* first four source pixels -> xmm_dst0/1 */
4504                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4505                                      &xmm_dst0, &xmm_dst1);
4509                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4510                                         &xmm_dst0, &xmm_dst1);
            /* second four source pixels -> xmm_dst2/3 */
4514             opaque = is_opaque (xmm_src);
4515             zero = is_zero (xmm_src);
4517             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4521                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4522                                      &xmm_dst2, &xmm_dst3);
4526                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4527                                         &xmm_dst2, &xmm_dst3);
4531                 (__m128i*)dst, pack_565_4x128_128 (
4532                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    /* scalar tail */
4544             ms = unpack_32_1x64 (s);
4546             *dst++ = pack_565_32_16 (
4548                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4556 /* -------------------------------------------------------------------------
4557 * composite_over_pixbuf_8888
/*
 * OVER-composite a "pixbuf" source (non-premultiplied, reversed
 * channel order) onto an 8888 destination (SSE2), 4 pixels per
 * iteration.
 *
 * Fast paths per quad via is_opaque/is_zero: opaque sources skip the
 * destination read and just write the color-inverted source; the
 * general case reads the destination and blends with
 * over_rev_non_pre_2x128.
 *
 * NOTE(review): partial extract -- some loop headers/branch lines of
 * the original are not visible here.
 */
4561 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4563                                  pixman_image_t * src_image,
4564                                  pixman_image_t * mask_image,
4565                                  pixman_image_t * dst_image,
4575     uint32_t    *dst_line, *dst, d;
4576     uint32_t    *src_line, *src, s;
4577     int dst_stride, src_stride;
4579     uint32_t opaque, zero;
4581     __m128i xmm_src_lo, xmm_src_hi;
4582     __m128i xmm_dst_lo, xmm_dst_hi;
4584     PIXMAN_IMAGE_GET_LINE (
4585         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4586     PIXMAN_IMAGE_GET_LINE (
4587         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4592      * I copy the code from MMX one and keep the fixme.
4593      * If it's a problem there, probably is a problem here.
4595     assert (src_image->drawable == mask_image->drawable);
4601         dst_line += dst_stride;
4603         src_line += src_stride;
4606         /* call prefetch hint to optimize cache load */
4607         cache_prefetch ((__m128i*)src);
4608         cache_prefetch ((__m128i*)dst);
    /* scalar head until dst is 16-byte aligned */
4610         while (w && (unsigned long)dst & 15)
4615             *dst++ = pack_1x64_32 (
4616                 over_rev_non_pre_1x64 (
4617                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4622         /* call prefetch hint to optimize cache load */
4623         cache_prefetch ((__m128i*)src);
4624         cache_prefetch ((__m128i*)dst);
4628             /* fill cache line with next memory */
4629             cache_prefetch_next ((__m128i*)src);
4630             cache_prefetch_next ((__m128i*)dst);
4632             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4634             opaque = is_opaque (xmm_src_hi);
4635             zero = is_zero (xmm_src_hi);
4637             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
            /* opaque quad: no destination read needed */
4641                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4642                                      &xmm_dst_lo, &xmm_dst_hi);
4645                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            /* general case: read, blend, write back */
4649                 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4651                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4653                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4654                                         &xmm_dst_lo, &xmm_dst_hi);
4657                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    /* scalar tail */
4670             *dst++ = pack_1x64_32 (
4671                 over_rev_non_pre_1x64 (
4672                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4681 /* -------------------------------------------------------------------------------------------------
4682 * composite_over_n_8888_0565_ca
/*
 * OVER-composite a solid source through a component-alpha a8r8g8b8 mask
 * onto an r5g6b5 destination (SSE2), 8 destination pixels per
 * iteration (two 4-pixel mask loads against one 8-pixel 565 load).
 *
 * Each 4-pixel half is skipped entirely when its mask quad is all zero
 * (pack_cmp == 0xffff after the compare-to-zero/movemask).
 *
 * NOTE(review): partial extract -- some loop headers/statements of the
 * original are not visible here.
 */
4686 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4688                                     pixman_image_t * src_image,
4689                                     pixman_image_t * mask_image,
4690                                     pixman_image_t * dst_image,
4701     uint16_t    *dst_line, *dst, d;
4702     uint32_t    *mask_line, *mask, m;
4703     int dst_stride, mask_stride;
4707     __m128i xmm_src, xmm_alpha;
4708     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4709     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4711     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4713     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4718     PIXMAN_IMAGE_GET_LINE (
4719         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4720     PIXMAN_IMAGE_GET_LINE (
4721         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4723     xmm_src = expand_pixel_32_1x128 (src);
4724     xmm_alpha = expand_alpha_1x128 (xmm_src);
4725     mmx_src = _mm_movepi64_pi64 (xmm_src);
4726     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4733         mask_line += mask_stride;
4734         dst_line += dst_stride;
4736         /* call prefetch hint to optimize cache load */
4737         cache_prefetch ((__m128i*)mask);
4738         cache_prefetch ((__m128i*)dst);
    /* scalar head until dst is 16-byte aligned */
4740         while (w && ((unsigned long)dst & 15))
4742             m = *(uint32_t *) mask;
4747                 mmx_mask = unpack_32_1x64 (m);
4748                 mmx_dest = expand565_16_1x64 (d);
4750                 *dst = pack_565_32_16 (
4753                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4761         /* call prefetch hint to optimize cache load */
4762         cache_prefetch ((__m128i*)mask);
4763         cache_prefetch ((__m128i*)dst);
4767             /* fill cache line with next memory */
4768             cache_prefetch_next ((__m128i*)mask);
4769             cache_prefetch_next ((__m128i*)dst);
4772             xmm_mask = load_128_unaligned ((__m128i*)mask);
4773             xmm_dst = load_128_aligned ((__m128i*)dst);
4775             pack_cmp = _mm_movemask_epi8 (
4776                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4778             unpack_565_128_4x128 (xmm_dst,
4779                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4780             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4782             /* preload next round */
4783             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4785             /* blend the first four pixels only if their mask quad is
             * not all zero (pack_cmp == 0xffff means all zero) */
4786             if (pack_cmp != 0xffff)
4788                 in_over_2x128 (&xmm_src, &xmm_src,
4789                                &xmm_alpha, &xmm_alpha,
4790                                &xmm_mask_lo, &xmm_mask_hi,
4791                                &xmm_dst0, &xmm_dst1);
            /* second mask quad -> pixels 4..7 */
4795             pack_cmp = _mm_movemask_epi8 (
4796                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4798             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4800             if (pack_cmp != 0xffff)
4802                 in_over_2x128 (&xmm_src, &xmm_src,
4803                                &xmm_alpha, &xmm_alpha,
4804                                &xmm_mask_lo, &xmm_mask_hi,
4805                                &xmm_dst2, &xmm_dst3);
4809                 (__m128i*)dst, pack_565_4x128_128 (
4810                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    /* scalar tail */
4819             m = *(uint32_t *) mask;
4824                 mmx_mask = unpack_32_1x64 (m);
4825                 mmx_dest = expand565_16_1x64 (d);
4827                 *dst = pack_565_32_16 (
4830                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4842 /* -----------------------------------------------------------------------
4843 * composite_in_n_8_8
/*
 * IN-composite: dst = solid_src_alpha * mask * dst, all operands a8
 * (SSE2, 16 mask/dst bytes per iteration in the vector body).
 *
 * Only the solid source's alpha matters; it is expanded once into
 * xmm_alpha.  Each pixel is two multiplies: (alpha * mask) then
 * (result * dst).
 *
 * NOTE(review): partial extract -- some loop headers/statements of the
 * original are not visible here.
 */
4847 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4849                          pixman_image_t * src_image,
4850                          pixman_image_t * mask_image,
4851                          pixman_image_t * dst_image,
4861     uint8_t     *dst_line, *dst;
4862     uint8_t     *mask_line, *mask;
4863     int dst_stride, mask_stride;
4870     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4871     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4873     PIXMAN_IMAGE_GET_LINE (
4874         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4875     PIXMAN_IMAGE_GET_LINE (
4876         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4878     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4882     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4887         dst_line += dst_stride;
4889         mask_line += mask_stride;
4892         /* call prefetch hint to optimize cache load */
4893         cache_prefetch ((__m128i*)mask);
4894         cache_prefetch ((__m128i*)dst);
    /* scalar head until dst is 16-byte aligned */
4896         while (w && ((unsigned long)dst & 15))
4898             m = (uint32_t) *mask++;
4899             d = (uint32_t) *dst;
4901             *dst++ = (uint8_t) pack_1x64_32 (
4903                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4904                                        unpack_32_1x64 (m)),
4905                     unpack_32_1x64 (d)));
4909         /* call prefetch hint to optimize cache load */
4910         cache_prefetch ((__m128i*)mask);
4911         cache_prefetch ((__m128i*)dst);
4915             /* fill cache line with next memory */
4916             cache_prefetch_next ((__m128i*)mask);
4917             cache_prefetch_next ((__m128i*)dst);
4919             xmm_mask = load_128_unaligned ((__m128i*)mask);
4920             xmm_dst = load_128_aligned ((__m128i*)dst);
4922             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4923             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            /* alpha * mask, then * dst */
4925             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4926                                 &xmm_mask_lo, &xmm_mask_hi,
4927                                 &xmm_mask_lo, &xmm_mask_hi);
4929             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4930                                 &xmm_dst_lo, &xmm_dst_hi,
4931                                 &xmm_dst_lo, &xmm_dst_hi);
4934                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    /* scalar tail */
4943             m = (uint32_t) *mask++;
4944             d = (uint32_t) *dst;
4946             *dst++ = (uint8_t) pack_1x64_32 (
4949                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4950                     unpack_32_1x64 (d)));
4958 /* ---------------------------------------------------------------------------
/*
 * IN-composite: dst = src * dst, both a8 (SSE2, 16 bytes per vector
 * iteration).  One pix_multiply per pixel.
 *
 * NOTE(review): partial extract -- some loop headers/statements of the
 * original are not visible here.
 */
4963 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4965                        pixman_image_t * src_image,
4966                        pixman_image_t * mask_image,
4967                        pixman_image_t * dst_image,
4977     uint8_t     *dst_line, *dst;
4978     uint8_t     *src_line, *src;
4979     int src_stride, dst_stride;
4983     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4984     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4986     PIXMAN_IMAGE_GET_LINE (
4987         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4988     PIXMAN_IMAGE_GET_LINE (
4989         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4994         dst_line += dst_stride;
4996         src_line += src_stride;
4999         /* call prefetch hint to optimize cache load */
5000         cache_prefetch ((__m128i*)src);
5001         cache_prefetch ((__m128i*)dst);
    /* scalar head until dst is 16-byte aligned */
5003         while (w && ((unsigned long)dst & 15))
5005             s = (uint32_t) *src++;
5006             d = (uint32_t) *dst;
5008             *dst++ = (uint8_t) pack_1x64_32 (
5010                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
5014         /* call prefetch hint to optimize cache load */
5015         cache_prefetch ((__m128i*)src);
5016         cache_prefetch ((__m128i*)dst);
5020             /* fill cache line with next memory */
5021             cache_prefetch_next ((__m128i*)src);
5022             cache_prefetch_next ((__m128i*)dst);
5024             xmm_src = load_128_unaligned ((__m128i*)src);
5025             xmm_dst = load_128_aligned ((__m128i*)dst);
5027             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5028             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5030             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5031                                 &xmm_dst_lo, &xmm_dst_hi,
5032                                 &xmm_dst_lo, &xmm_dst_hi);
5035                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    /* scalar tail */
5044             s = (uint32_t) *src++;
5045             d = (uint32_t) *dst;
5047             *dst++ = (uint8_t) pack_1x64_32 (
5048                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5056 /* -------------------------------------------------------------------------
5057 * composite_add_n_8_8
/*
 * ADD-composite: dst = saturate(dst + solid_src_alpha * mask), all
 * operands a8 (SSE2, 16 bytes per vector iteration).
 *
 * The vector body multiplies the expanded solid alpha by the mask,
 * then does a saturating 16-bit add (_mm_adds_epu16) against the
 * unpacked destination before repacking.
 *
 * NOTE(review): partial extract -- some loop headers/statements of the
 * original are not visible here.
 */
5061 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
5063                           pixman_image_t * src_image,
5064                           pixman_image_t * mask_image,
5065                           pixman_image_t * dst_image,
5075     uint8_t     *dst_line, *dst;
5076     uint8_t     *mask_line, *mask;
5077     int dst_stride, mask_stride;
5084     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5085     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5087     PIXMAN_IMAGE_GET_LINE (
5088         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5089     PIXMAN_IMAGE_GET_LINE (
5090         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5092     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5096     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5101         dst_line += dst_stride;
5103         mask_line += mask_stride;
5106         /* call prefetch hint to optimize cache load */
5107         cache_prefetch ((__m128i*)mask);
5108         cache_prefetch ((__m128i*)dst);
    /* scalar head until dst is 16-byte aligned */
5110         while (w && ((unsigned long)dst & 15))
5112             m = (uint32_t) *mask++;
5113             d = (uint32_t) *dst;
5115             *dst++ = (uint8_t) pack_1x64_32 (
5118                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5119                     unpack_32_1x64 (d)));
5123         /* call prefetch hint to optimize cache load */
5124         cache_prefetch ((__m128i*)mask);
5125         cache_prefetch ((__m128i*)dst);
5129             /* fill cache line with next memory */
5130             cache_prefetch_next ((__m128i*)mask);
5131             cache_prefetch_next ((__m128i*)dst);
5133             xmm_mask = load_128_unaligned ((__m128i*)mask);
5134             xmm_dst = load_128_aligned ((__m128i*)dst);
5136             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5137             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5139             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5140                                 &xmm_mask_lo, &xmm_mask_hi,
5141                                 &xmm_mask_lo, &xmm_mask_hi);
            /* saturating add in the 16-bit-per-channel domain */
5143             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5144             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5147                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    /* scalar tail */
5156             m = (uint32_t) *mask++;
5157             d = (uint32_t) *dst;
5159             *dst++ = (uint8_t) pack_1x64_32 (
5162                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5163                     unpack_32_1x64 (d)));
5172 /* ----------------------------------------------------------------------
5173 * composite_add_8000_8000
/* ADD operator for an a8 source onto an a8 destination, no mask.
 * A scalar head loop aligns dst to 4 bytes, the bulk of the row is handed
 * to core_combine_add_u_sse2 () as 32-bit words, and a scalar tail handles
 * the leftover bytes.
 * NOTE(review): several original lines are elided in this listing
 * (declarations of t/w, loop headers and braces are not visible). */
5177 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5179 pixman_image_t * src_image,
5180 pixman_image_t * mask_image,
5181 pixman_image_t * dst_image,
5191 uint8_t *dst_line, *dst;
5192 uint8_t *src_line, *src;
5193 int dst_stride, src_stride;
5197 PIXMAN_IMAGE_GET_LINE (
5198 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5199 PIXMAN_IMAGE_GET_LINE (
5200 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5207 /* call prefetch hint to optimize cache load*/
5208 cache_prefetch ((__m128i*)src);
5209 cache_prefetch ((__m128i*)dst);
5211 dst_line += dst_stride;
5212 src_line += src_stride;
/* Head: byte-at-a-time until dst is 4-byte aligned.
 * t | (0 - (t >> 8)) clamps the 9-bit sum to 0xff — a branchless
 * saturating add for 8-bit values. */
5216 while (w && (unsigned long)dst & 3)
5218 t = (*dst) + (*src++);
5219 *dst++ = t | (0 - (t >> 8));
/* Bulk: reinterpret the remaining byte run as w >> 2 32-bit words. */
5223 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
/* Tail: leftover bytes, same branchless saturating add as the head. */
5233 t = (*dst) + (*src++);
5234 *dst++ = t | (0 - (t >> 8));
5242 /* ---------------------------------------------------------------------
5243 * composite_add_8888_8888
/* ADD operator for an a8r8g8b8 source onto an a8r8g8b8 destination, no
 * mask.  Each scanline is passed whole to core_combine_add_u_sse2 ().
 * NOTE(review): the scanline loop header, the w/dst/src setup and the
 * closing braces are elided in this listing. */
5246 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5248 pixman_image_t * src_image,
5249 pixman_image_t * mask_image,
5250 pixman_image_t * dst_image,
5260 uint32_t *dst_line, *dst;
5261 uint32_t *src_line, *src;
5262 int dst_stride, src_stride;
5264 PIXMAN_IMAGE_GET_LINE (
5265 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5266 PIXMAN_IMAGE_GET_LINE (
5267 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
/* Advance to the next scanline; combine the current one below. */
5272 dst_line += dst_stride;
5274 src_line += src_stride;
5276 core_combine_add_u_sse2 (dst, src, NULL, width);
5282 /* -------------------------------------------------------------------------------------------------
5283 * sse2_composite_copy_area
/* Raw rectangle blit.  Only same-depth 16 bpp and 32 bpp blits are handled
 * (visible branches below); other cases presumably return FALSE so the
 * caller can fall back — the return statements are elided in this listing.
 * Per scanline: align dst to 16 bytes with 2- then 4-byte scalar copies,
 * stream 64 bytes per iteration through four XMM registers, then finish
 * with a 16-byte loop and scalar leftovers.
 * NOTE(review): many original lines are elided here (rest of the parameter
 * list, the 16-bpp branch header, loop headers, closing braces). */
5286 static pixman_bool_t
5287 pixman_blt_sse2 (uint32_t *src_bits,
5300 uint8_t * src_bytes;
5301 uint8_t * dst_bytes;
/* Mismatched source/destination depths are not supported by this blitter. */
5304 if (src_bpp != dst_bpp)
/* 16-bpp case: strides arrive in uint32 units; convert to uint16 units
 * and compute the starting byte addresses from (x, y). */
5309 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5310 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5311 src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5312 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5313 byte_width = 2 * width;
5317 else if (src_bpp == 32)
/* 32-bpp case: strides converted to uint32 units. */
5319 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5320 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5321 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5322 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5323 byte_width = 4 * width;
5332 cache_prefetch ((__m128i*)src_bytes);
5333 cache_prefetch ((__m128i*)dst_bytes);
/* Per-scanline copy: s/d walk the current row while the *_bytes pointers
 * advance by stride to the next row. */
5338 uint8_t *s = src_bytes;
5339 uint8_t *d = dst_bytes;
5340 src_bytes += src_stride;
5341 dst_bytes += dst_stride;
5344 cache_prefetch_next ((__m128i*)s);
5345 cache_prefetch_next ((__m128i*)d);
/* Align d to 4 bytes with 16-bit copies... */
5347 while (w >= 2 && ((unsigned long)d & 3))
5349 *(uint16_t *)d = *(uint16_t *)s;
/* ...then to 16 bytes with 32-bit copies. */
5355 while (w >= 4 && ((unsigned long)d & 15))
5357 *(uint32_t *)d = *(uint32_t *)s;
5364 cache_prefetch_next ((__m128i*)s);
5365 cache_prefetch_next ((__m128i*)d);
/* Main loop: 64 bytes per iteration.  Source may be unaligned; the
 * destination is 16-byte aligned thanks to the loops above. */
5369 __m128i xmm0, xmm1, xmm2, xmm3;
5371 /* 128 bytes ahead */
5372 cache_prefetch (((__m128i*)s) + 8);
5373 cache_prefetch (((__m128i*)d) + 8);
5375 xmm0 = load_128_unaligned ((__m128i*)(s));
5376 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5377 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5378 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5380 save_128_aligned ((__m128i*)(d), xmm0);
5381 save_128_aligned ((__m128i*)(d + 16), xmm1);
5382 save_128_aligned ((__m128i*)(d + 32), xmm2);
5383 save_128_aligned ((__m128i*)(d + 48), xmm3);
5390 cache_prefetch_next ((__m128i*)s);
5391 cache_prefetch_next ((__m128i*)d);
/* 16-byte leftovers. */
5395 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5402 cache_prefetch_next ((__m128i*)s);
5403 cache_prefetch_next ((__m128i*)d);
/* 4-byte, then 2-byte leftovers. */
5407 *(uint32_t *)d = *(uint32_t *)s;
5416 *(uint16_t *)d = *(uint16_t *)s;
/* Fast path that is a pure copy: forwards the two images' bits pointers,
 * row strides and per-format bpp to pixman_blt_sse2 ().
 * NOTE(review): the return type line and surrounding braces are elided in
 * this listing. */
5429 sse2_composite_copy_area (pixman_implementation_t *imp,
5431 pixman_image_t * src_image,
5432 pixman_image_t * mask_image,
5433 pixman_image_t * dst_image,
5443 pixman_blt_sse2 (src_image->bits.bits,
5444 dst_image->bits.bits,
5445 src_image->bits.rowstride,
5446 dst_image->bits.rowstride,
5447 PIXMAN_FORMAT_BPP (src_image->bits.format),
5448 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5449 src_x, src_y, dest_x, dest_y, width, height);
/* OVER operator: x8r8g8b8 source (alpha channel forced to 0xff below),
 * a8 mask, 8888 destination.  Scalar head loop until dst is 16-byte
 * aligned, then 4 pixels per SSE2 iteration, then a scalar tail.
 * Because the source is treated as opaque, a mask word of 0xffffffff lets
 * the main loop store the source directly without blending.
 * NOTE(review): many original lines are elided in this listing (loop
 * headers, the d = *dst loads, m == 0 / m == 0xff special cases in the
 * scalar paths, closing braces). */
5453 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5455 pixman_image_t * src_image,
5456 pixman_image_t * mask_image,
5457 pixman_image_t * dst_image,
5467 uint32_t *src, *src_line, s;
5468 uint32_t *dst, *dst_line, d;
5469 uint8_t *mask, *mask_line;
5471 int src_stride, mask_stride, dst_stride;
5475 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5476 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5477 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5479 PIXMAN_IMAGE_GET_LINE (
5480 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5481 PIXMAN_IMAGE_GET_LINE (
5482 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5483 PIXMAN_IMAGE_GET_LINE (
5484 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5489 src_line += src_stride;
5491 dst_line += dst_stride;
5493 mask_line += mask_stride;
5497 /* call prefetch hint to optimize cache load*/
5498 cache_prefetch ((__m128i*)src);
5499 cache_prefetch ((__m128i*)dst);
5500 cache_prefetch ((__m128i*)mask);
/* Head loop: scalar MMX-width path until dst is 16-byte aligned.
 * The source pixel is made opaque by ORing in the alpha byte. */
5502 while (w && (unsigned long)dst & 15)
5504 s = 0xff000000 | *src++;
5505 m = (uint32_t) *mask++;
5507 ms = unpack_32_1x64 (s);
5511 __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5512 __m64 md = unpack_32_1x64 (d);
/* in_over: (src IN mask) OVER dest; mask_x00ff is the opaque alpha. */
5514 ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
5517 *dst++ = pack_1x64_32 (ms);
5521 /* call prefetch hint to optimize cache load*/
5522 cache_prefetch ((__m128i*)src);
5523 cache_prefetch ((__m128i*)dst);
5524 cache_prefetch ((__m128i*)mask);
5528 /* fill cache line with next memory */
5529 cache_prefetch_next ((__m128i*)src);
5530 cache_prefetch_next ((__m128i*)dst);
5531 cache_prefetch_next ((__m128i*)mask);
/* Main loop: read 4 mask bytes at once and force source opaque. */
5533 m = *(uint32_t*) mask;
5534 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
/* Fully-opaque mask + opaque source: plain store, no blending needed. */
5536 if (m == 0xffffffff)
5538 save_128_aligned ((__m128i*)dst, xmm_src);
/* General case: expand the 4 mask bytes into per-channel 16-bit lanes. */
5542 xmm_dst = load_128_aligned ((__m128i*)dst);
5544 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5546 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5547 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5548 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5550 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
/* mask_00ff stands in for the (opaque) source alpha. */
5552 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5554 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail loop: scalar path, same formula as the head loop. */
5565 m = (uint32_t) *mask++;
5569 s = 0xff000000 | *src;
5581 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
5582 md = unpack_32_1x64 (d);
5583 ms = unpack_32_1x64 (s);
5585 *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
/* OVER operator: a8r8g8b8 source (real alpha), a8 mask, 8888 destination.
 * Same head / 4-pixel SSE2 / tail structure as the x888 variant above, but
 * the source alpha (sa) participates: the scalar paths short-circuit when
 * both sa and m are 0xff (presumably storing the source directly — that
 * branch body is elided), and the main loop stores without blending only
 * when the mask word is 0xffffffff AND is_opaque (xmm_src) holds.
 * NOTE(review): many original lines are elided in this listing (s/d loads,
 * sa extraction, m == 0 skips, loop headers, closing braces). */
5600 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5602 pixman_image_t * src_image,
5603 pixman_image_t * mask_image,
5604 pixman_image_t * dst_image,
5614 uint32_t *src, *src_line, s;
5615 uint32_t *dst, *dst_line, d;
5616 uint8_t *mask, *mask_line;
5618 int src_stride, mask_stride, dst_stride;
5621 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5622 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5623 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5625 PIXMAN_IMAGE_GET_LINE (
5626 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5627 PIXMAN_IMAGE_GET_LINE (
5628 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5629 PIXMAN_IMAGE_GET_LINE (
5630 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5635 src_line += src_stride;
5637 dst_line += dst_stride;
5639 mask_line += mask_stride;
5643 /* call prefetch hint to optimize cache load*/
5644 cache_prefetch ((__m128i *)src);
5645 cache_prefetch ((__m128i *)dst);
5646 cache_prefetch ((__m128i *)mask);
/* Head loop: scalar path until dst is 16-byte aligned. */
5648 while (w && (unsigned long)dst & 15)
5653 m = (uint32_t) *mask++;
/* Opaque source pixel under an opaque mask byte: no blend required. */
5660 if (sa == 0xff && m == 0xff)
/* General scalar blend: (src IN src_alpha? IN mask) OVER dest via
 * in_over_1x64 with expanded mask and source-alpha vectors. */
5666 __m64 ms, md, ma, msa;
5668 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5669 ms = unpack_32_1x64 (s);
5670 md = unpack_32_1x64 (d);
5672 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5674 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
5682 /* call prefetch hint to optimize cache load*/
5683 cache_prefetch ((__m128i *)src);
5684 cache_prefetch ((__m128i *)dst);
5685 cache_prefetch ((__m128i *)mask);
5689 /* fill cache line with next memory */
5690 cache_prefetch_next ((__m128i *)src);
5691 cache_prefetch_next ((__m128i *)dst);
5692 cache_prefetch_next ((__m128i *)mask);
/* Main loop: 4 mask bytes and 4 source pixels at a time. */
5694 m = *(uint32_t *) mask;
5698 xmm_src = load_128_unaligned ((__m128i*)src);
/* All four mask bytes opaque and all four source pixels opaque:
 * store the source directly. */
5700 if (m == 0xffffffff && is_opaque (xmm_src))
5702 save_128_aligned ((__m128i *)dst, xmm_src);
/* General case: expand mask bytes and source alpha to 16-bit lanes. */
5706 xmm_dst = load_128_aligned ((__m128i *)dst);
5708 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5710 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5711 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5712 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5714 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5715 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5717 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5718 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5720 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Tail loop: scalar path, same special case and blend as the head. */
5735 m = (uint32_t) *mask++;
5742 if (sa == 0xff && m == 0xff)
5748 __m64 ms, md, ma, msa;
5750 ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
5751 ms = unpack_32_1x64 (s);
5752 md = unpack_32_1x64 (d);
5754 msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
5756 *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
/* Fast-path dispatch table: each entry maps (operator, source format,
 * mask format, destination format) to an SSE2 composite routine above.
 * Consulted by the generic pixman implementation machinery.
 * NOTE(review): the terminating sentinel entry and some section comments
 * are elided in this listing. */
5768 static const pixman_fast_path_t sse2_fast_paths[] =
5770 /* PIXMAN_OP_OVER */
5771 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5772 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5773 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5774 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5775 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5776 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5777 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5778 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5779 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5780 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5781 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5782 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5783 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5784 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5785 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5786 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5787 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5788 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5789 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5790 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5791 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5792 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5793 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5794 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5795 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5796 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5797 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5798 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5799 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5800 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5801 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5802 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5803 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5804 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5805 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5806 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5807 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5808 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5809 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5810 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5811 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5812 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5813 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5814 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5815 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
/* PIXMAN_OP_ADD (section comment elided in this listing) */
5818 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5819 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
5820 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5821 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5822 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
/* PIXMAN_OP_SRC (section comment elided in this listing) */
5825 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5826 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5827 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5828 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5829 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5830 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5831 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5832 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5833 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5834 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5835 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5836 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
/* PIXMAN_OP_IN (section comment elided in this listing) */
5839 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5840 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
/* Implementation hook for blits: try the SSE2 blitter first; if it reports
 * failure (e.g. unsupported bpp), delegate down the implementation chain.
 * NOTE(review): part of the parameter list, the braces, and the delegate
 * argument of _pixman_implementation_blt are elided in this listing. */
5845 static pixman_bool_t
5846 sse2_blt (pixman_implementation_t *imp,
5847 uint32_t * src_bits,
5848 uint32_t * dst_bits,
5860 if (!pixman_blt_sse2 (
5861 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5862 src_x, src_y, dst_x, dst_y, width, height))
5865 return _pixman_implementation_blt (
5867 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5868 src_x, src_y, dst_x, dst_y, width, height);
5874 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
/* On 32-bit GCC targets the incoming stack may be only 4-byte aligned;
 * this attribute realigns it so 16-byte SSE spills are safe — see GCC's
 * force_align_arg_pointer documentation. */
5875 __attribute__((__force_align_arg_pointer__))
/* Implementation hook for fills: try pixman_fill_sse2 (); on failure,
 * delegate down the implementation chain.
 * NOTE(review): most of the parameter list and braces are elided here. */
5877 static pixman_bool_t
5878 sse2_fill (pixman_implementation_t *imp,
5888 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5890 return _pixman_implementation_fill (
5891 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5897 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
/* Stack realignment for 32-bit GCC, as for sse2_fill above. */
5898 __attribute__((__force_align_arg_pointer__))
/* Constructor for the SSE2 implementation: builds the delegate chain,
 * initializes the SSE2/MMX constant masks used by the routines in this
 * file, and installs the per-operator combiners plus blt/fill hooks.
 * NOTE(review): the two 'fallback' assignments below are presumably in
 * the two arms of an elided #ifdef USE_MMX / #else — confirm against the
 * full source; the return statement is also elided in this listing. */
5900 pixman_implementation_t *
5901 _pixman_implementation_create_sse2 (void)
5904 pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
5906 pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
5908 pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
5910 /* SSE2 constants */
5911 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5912 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5913 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5914 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5915 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5916 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5917 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5918 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5919 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
5920 mask_0080 = create_mask_16_128 (0x0080);
5921 mask_00ff = create_mask_16_128 (0x00ff);
5922 mask_0101 = create_mask_16_128 (0x0101);
5923 mask_ffff = create_mask_16_128 (0xffff);
5924 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5925 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
/* 64-bit (MMX-width) constants used by the scalar 1x64 helper paths. */
5928 mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5929 mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5931 mask_x0080 = create_mask_16_64 (0x0080);
5932 mask_x00ff = create_mask_16_64 (0x00ff);
5933 mask_x0101 = create_mask_16_64 (0x0101);
5934 mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5938 /* Set up function pointers */
5940 /* SSE code patch for fbcompose.c */
/* Unified (non-component-alpha) combiners. */
5941 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5942 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5943 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5944 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5945 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5946 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5947 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5948 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5949 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5950 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5952 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
/* Component-alpha (per-channel mask) combiners. */
5954 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5955 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5956 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5957 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5958 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5959 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5960 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5961 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5962 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5963 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5964 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
/* Whole-rectangle operations. */
5966 imp->blt = sse2_blt;
5967 imp->fill = sse2_fill;
5972 #endif /* USE_SSE2 */