2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* MSVC does not support MMX intrinsics when targeting 64-bit
41 * Windows, so the pixman-x64-mmx-emulation.h file contains
42 * implementations of those MMX intrinsics that
43 * are used in the SSE2 implementation.
45 # include "pixman-x64-mmx-emulation.h"
50 /* --------------------------------------------------------------------
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
79 /* ----------------------------------------------------------------------
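/* The helpers below work on "unpacked" pixels: each 8 bit channel is
 * widened to a 16 bit lane, so an xmm register holds two unpacked
 * pixels and an mmx register holds one.  Widening keeps the 8 x 8 bit
 * products of the blending math from overflowing before they are
 * divided by 255 and packed back down to 8 bits per channel.
 */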
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
85 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
91 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
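/* Convert four r5g6b5 pixels, one per 32 bit lane, to x8r8g8b8.  Each
 * field is shifted into place and then has its top bits replicated
 * into the newly opened low bits, so that e.g. 0x1f expands to 0xff
 * rather than 0xf8.
 */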
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
98 __m128i r, g, b, rb, t;
100 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
104 rb = _mm_or_si128 (r, b);
105 t = _mm_and_si128 (rb, mask_565_fix_rb);
106 t = _mm_srli_epi32 (t, 5);
107 rb = _mm_or_si128 (rb, t);
109 t = _mm_and_si128 (g, mask_565_fix_g);
110 t = _mm_srli_epi32 (t, 6);
111 g = _mm_or_si128 (g, t);
113 return _mm_or_si128 (rb, g);
116 static force_inline void
117 unpack_565_128_4x128 (__m128i data,
125 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
128 lo = unpack_565_to_8888 (lo);
129 hi = unpack_565_to_8888 (hi);
131 unpack_128_2x128 (lo, data0, data1);
132 unpack_128_2x128 (hi, data2, data3);
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
138 return (uint16_t) (((pixel >> 8) & 0xf800) |
139 ((pixel >> 5) & 0x07e0) |
140 ((pixel >> 3) & 0x001f));
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
146 return _mm_packus_epi16 (lo, hi);
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
153 __m128i r, g1, g2, b;
155 data = pack_2x128_128 (lo, hi);
157 r = _mm_and_si128 (data, mask_565_r);
158 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
162 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
168 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169 pack_565_2x128_128 (*xmm2, *xmm3));
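/* _mm_movemask_epi8 () gathers the top bit of every byte.  With
 * a8r8g8b8 pixels the 0x8888 mask below picks out just the four alpha
 * bytes, so is_opaque () is true when all four alphas are 0xff,
 * is_transparent () when they are all 0x00, and is_zero () when the
 * whole register is zero.
 */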
172 static force_inline int
173 is_opaque (__m128i x)
175 __m128i ffs = _mm_cmpeq_epi8 (x, x);
177 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
180 static force_inline int
183 return _mm_movemask_epi8 (
184 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
187 static force_inline int
188 is_transparent (__m128i x)
190 return (_mm_movemask_epi8 (
191 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
197 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
203 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204 _MM_SHUFFLE (3, 3, 3, 3)),
205 _MM_SHUFFLE (3, 3, 3, 3));
208 static force_inline void
209 expand_alpha_2x128 (__m128i data_lo,
216 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
219 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i data_lo,
231 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
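/* Per channel multiply of two pairs of unpacked pixels, including the
 * rounded division by 255.  The division uses the identity
 *
 *     (t + 0x80 + ((t + 0x80) >> 8)) >> 8 == ((t + 0x80) * 0x0101) >> 16
 *
 * which is what the add of mask_0080 followed by _mm_mulhi_epu16 with
 * mask_0101 computes; the result matches the scalar MUL_UN8 () macro.
 */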
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
247 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249 lo = _mm_adds_epu16 (lo, mask_0080);
250 hi = _mm_adds_epu16 (hi, mask_0080);
251 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
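/* Computes (src * alpha_dst + dst * alpha_src) / 255 per channel,
 * with the same rounding as pix_multiply_2x128 () and saturating
 * intermediate sums.  This is the building block of the ATOP and XOR
 * operators.
 */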
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
258 __m128i* alpha_dst_lo,
259 __m128i* alpha_dst_hi,
262 __m128i* alpha_src_lo,
263 __m128i* alpha_src_hi,
268 __m128i mul_lo, mul_hi;
270 lo = _mm_mullo_epi16 (*src_lo, *alpha_dst_lo);
271 hi = _mm_mullo_epi16 (*src_hi, *alpha_dst_hi);
272 mul_lo = _mm_mullo_epi16 (*dst_lo, *alpha_src_lo);
273 mul_hi = _mm_mullo_epi16 (*dst_hi, *alpha_src_hi);
274 lo = _mm_adds_epu16 (lo, mask_0080);
275 hi = _mm_adds_epu16 (hi, mask_0080);
276 lo = _mm_adds_epu16 (lo, mul_lo);
277 hi = _mm_adds_epu16 (hi, mul_hi);
278 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
279 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
282 static force_inline void
283 negate_2x128 (__m128i data_lo,
288 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
289 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
292 static force_inline void
293 invert_colors_2x128 (__m128i data_lo,
300 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
301 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
302 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
303 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
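/* The OVER operator on unpacked data:
 *
 *     dst = src + dst * (255 - alpha) / 255
 *
 * where alpha is normally the already-expanded source alpha.  The
 * final add saturates, as the 8 bit semantics require.
 */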
306 static force_inline void
307 over_2x128 (__m128i* src_lo,
316 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
318 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
320 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
321 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
324 static force_inline void
325 over_rev_non_pre_2x128 (__m128i src_lo,
331 __m128i alpha_lo, alpha_hi;
333 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
335 lo = _mm_or_si128 (alpha_lo, mask_alpha);
336 hi = _mm_or_si128 (alpha_hi, mask_alpha);
338 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
340 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
342 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
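/* (src IN mask) OVER dst: the source and its alpha are both
 * multiplied by the mask first, then composited over the destination
 * with over_2x128 ().
 */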
345 static force_inline void
346 in_over_2x128 (__m128i* src_lo,
358 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
359 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
361 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
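/* Software prefetch helpers: cache_prefetch () touches the cache line
 * holding *addr, cache_prefetch_next () the line 64 bytes (four xmm
 * words) further on, so the next iteration's data is already on the
 * way while the current pixels are being processed.
 */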
364 static force_inline void
365 cache_prefetch (__m128i* addr)
367 _mm_prefetch (addr, _MM_HINT_T0);
370 static force_inline void
371 cache_prefetch_next (__m128i* addr)
373 _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
376 /* load 4 pixels from a 16-byte aligned address */
377 static force_inline __m128i
378 load_128_aligned (__m128i* src)
380 return _mm_load_si128 (src);
383 /* load 4 pixels from an unaligned address */
384 static force_inline __m128i
385 load_128_unaligned (const __m128i* src)
387 return _mm_loadu_si128 (src);
390 /* save 4 pixels using write-combining (non-temporal) stores
391 * to a 16-byte aligned address
393 static force_inline void
394 save_128_write_combining (__m128i* dst,
397 _mm_stream_si128 (dst, data);
400 /* save 4 pixels to a 16-byte aligned address */
401 static force_inline void
402 save_128_aligned (__m128i* dst,
405 _mm_store_si128 (dst, data);
408 /* save 4 pixels to an unaligned address */
409 static force_inline void
410 save_128_unaligned (__m128i* dst,
413 _mm_storeu_si128 (dst, data);
416 /* ------------------------------------------------------------------
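/* The __m64 helpers below mirror the __m128i ones and are used for
 * the single pixel head and tail of the loops, where a full four
 * pixel vector is not available.  On 64-bit MSVC they go through the
 * MMX emulation header included at the top of this file.
 */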
420 static force_inline __m64
421 unpack_32_1x64 (uint32_t data)
423 return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
426 static force_inline __m64
427 expand_alpha_1x64 (__m64 data)
429 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
432 static force_inline __m64
433 expand_alpha_rev_1x64 (__m64 data)
435 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
438 static force_inline __m64
439 expand_pixel_8_1x64 (uint8_t data)
441 return _mm_shuffle_pi16 (
442 unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
445 static force_inline __m64
446 pix_multiply_1x64 (__m64 data,
449 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
454 static force_inline __m64
455 pix_add_multiply_1x64 (__m64* src,
460 return _mm_mulhi_pu16 (
461 _mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst),
463 _mm_mullo_pi16 (*dst, *alpha_src)),
467 static force_inline __m64
468 negate_1x64 (__m64 data)
470 return _mm_xor_si64 (data, mask_x00ff);
473 static force_inline __m64
474 invert_colors_1x64 (__m64 data)
476 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
479 static force_inline __m64
480 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
482 return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
485 static force_inline __m64
486 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
488 return over_1x64 (pix_multiply_1x64 (*src, *mask),
489 pix_multiply_1x64 (*alpha, *mask),
493 static force_inline __m64
494 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
496 __m64 alpha = expand_alpha_1x64 (src);
498 return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
499 _mm_or_si64 (alpha, mask_x_alpha)),
504 static force_inline uint32_t
505 pack_1x64_32 (__m64 data)
507 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
510 /* Expand 16 bits positioned at @pos (0-3) of an mmx register into
 *
 *    00RR 00GG 00BB
 *
514 * --- Expanding 565 in the low word ---
 *
516 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
517 * m = m & (01f0003f001f);
518 * m = m * (008404100840);
 * m = m >> 8;
 *
521 * Note the trick here - the top word is shifted by another nibble to
522 * avoid it bumping into the middle word
524 static force_inline __m64
525 expand565_16_1x64 (uint16_t pixel)
530 p = _mm_cvtsi32_si64 ((uint32_t) pixel);
532 t1 = _mm_slli_si64 (p, 36 - 11);
533 t2 = _mm_slli_si64 (p, 16 - 5);
535 p = _mm_or_si64 (t1, p);
536 p = _mm_or_si64 (t2, p);
537 p = _mm_and_si64 (p, mask_x565_rgb);
538 p = _mm_mullo_pi16 (p, mask_x565_unpack);
540 return _mm_srli_pi16 (p, 8);
543 /* ----------------------------------------------------------------------------
544 * Compose Core transformations
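 * Each core_combine_*_sse2 () routine below applies one compositing
 * operator to a run of pixels, either in "unified" form (_u, at most
 * one alpha value per mask pixel) or in "component alpha" form (_ca,
 * the mask carries separate factors per channel).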
546 static force_inline uint32_t
547 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
560 ms = unpack_32_1x64 (src);
561 return pack_1x64_32 (
562 over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
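/* combine1 () and combine4 () fold the (optional) mask into the
 * source: each source pixel is multiplied by the alpha of the
 * corresponding mask pixel.  combine4 () also short-circuits a fully
 * transparent mask to zero.
 */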
568 static force_inline uint32_t
569 combine1 (const uint32_t *ps, const uint32_t *pm)
577 mm = unpack_32_1x64 (*pm);
578 mm = expand_alpha_1x64 (mm);
580 ms = unpack_32_1x64 (s);
581 ms = pix_multiply_1x64 (ms, mm);
583 s = pack_1x64_32 (ms);
589 static force_inline __m128i
590 combine4 (const __m128i *ps, const __m128i *pm)
592 __m128i xmm_src_lo, xmm_src_hi;
593 __m128i xmm_msk_lo, xmm_msk_hi;
598 xmm_msk_lo = load_128_unaligned (pm);
600 if (is_transparent (xmm_msk_lo))
601 return _mm_setzero_si128 ();
604 s = load_128_unaligned (ps);
608 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
609 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
611 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
613 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
614 &xmm_msk_lo, &xmm_msk_hi,
615 &xmm_src_lo, &xmm_src_hi);
617 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
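/* The combiners below all share the same loop shape: single pixels
 * through the 1x64 path until the destination pointer is 16 byte
 * aligned, then four pixels per iteration with aligned stores, then a
 * scalar tail for the remainder.  The OVER loop additionally skips
 * the blend when the four source pixels are all opaque (plain store)
 * or all zero (no-op).
 */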
623 static force_inline void
624 core_combine_over_u_sse2 (uint32_t* pd,
631 __m128i xmm_dst_lo, xmm_dst_hi;
632 __m128i xmm_src_lo, xmm_src_hi;
633 __m128i xmm_alpha_lo, xmm_alpha_hi;
635 /* call prefetch hint to optimize cache load*/
636 cache_prefetch ((__m128i*)ps);
637 cache_prefetch ((__m128i*)pd);
638 cache_prefetch ((__m128i*)pm);
640 /* Align dst on a 16-byte boundary */
641 while (w && ((unsigned long)pd & 15))
644 s = combine1 (ps, pm);
646 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
653 /* call prefetch hint to optimize cache load*/
654 cache_prefetch ((__m128i*)ps);
655 cache_prefetch ((__m128i*)pd);
656 cache_prefetch ((__m128i*)pm);
660 /* fill cache line with next memory */
661 cache_prefetch_next ((__m128i*)ps);
662 cache_prefetch_next ((__m128i*)pd);
663 cache_prefetch_next ((__m128i*)pm);
665 /* Loading unaligned because the source and mask
666 * addresses are not guaranteed to be 16-byte aligned.
668 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
670 if (is_opaque (xmm_src_hi))
672 save_128_aligned ((__m128i*)pd, xmm_src_hi);
674 else if (!is_zero (xmm_src_hi))
676 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
678 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
679 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
682 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
684 over_2x128 (&xmm_src_lo, &xmm_src_hi,
685 &xmm_alpha_lo, &xmm_alpha_hi,
686 &xmm_dst_lo, &xmm_dst_hi);
688 /* rebuild the 4 pixel data and save */
689 save_128_aligned ((__m128i*)pd,
690 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
703 s = combine1 (ps, pm);
705 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
714 static force_inline void
715 core_combine_over_reverse_u_sse2 (uint32_t* pd,
722 __m128i xmm_dst_lo, xmm_dst_hi;
723 __m128i xmm_src_lo, xmm_src_hi;
724 __m128i xmm_alpha_lo, xmm_alpha_hi;
726 /* call prefetch hint to optimize cache load*/
727 cache_prefetch ((__m128i*)ps);
728 cache_prefetch ((__m128i*)pd);
729 cache_prefetch ((__m128i*)pm);
731 /* Align dst on a 16-byte boundary */
733 ((unsigned long)pd & 15))
736 s = combine1 (ps, pm);
738 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
745 /* call prefetch hint to optimize cache load*/
746 cache_prefetch ((__m128i*)ps);
747 cache_prefetch ((__m128i*)pd);
748 cache_prefetch ((__m128i*)pm);
752 /* fill cache line with next memory */
753 cache_prefetch_next ((__m128i*)ps);
754 cache_prefetch_next ((__m128i*)pd);
755 cache_prefetch_next ((__m128i*)pm);
757 /* Loading unaligned because the source and mask
758 * addresses are not guaranteed to be 16-byte aligned.
760 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
761 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
763 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
764 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
766 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
767 &xmm_alpha_lo, &xmm_alpha_hi);
769 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
770 &xmm_alpha_lo, &xmm_alpha_hi,
771 &xmm_src_lo, &xmm_src_hi);
773 /* rebuild the 4 pixel data and save */
774 save_128_aligned ((__m128i*)pd,
775 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
788 s = combine1 (ps, pm);
790 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
798 static force_inline uint32_t
799 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
801 uint32_t maska = src >> 24;
807 else if (maska != 0xff)
809 return pack_1x64_32 (
810 pix_multiply_1x64 (unpack_32_1x64 (dst),
811 expand_alpha_1x64 (unpack_32_1x64 (src))));
817 static force_inline void
818 core_combine_in_u_sse2 (uint32_t* pd,
825 __m128i xmm_src_lo, xmm_src_hi;
826 __m128i xmm_dst_lo, xmm_dst_hi;
828 /* call prefetch hint to optimize cache load*/
829 cache_prefetch ((__m128i*)ps);
830 cache_prefetch ((__m128i*)pd);
831 cache_prefetch ((__m128i*)pm);
833 while (w && ((unsigned long) pd & 15))
835 s = combine1 (ps, pm);
838 *pd++ = core_combine_in_u_pixelsse2 (d, s);
845 /* call prefetch hint to optimize cache load*/
846 cache_prefetch ((__m128i*)ps);
847 cache_prefetch ((__m128i*)pd);
848 cache_prefetch ((__m128i*)pm);
852 /* fill cache line with next memory */
853 cache_prefetch_next ((__m128i*)ps);
854 cache_prefetch_next ((__m128i*)pd);
855 cache_prefetch_next ((__m128i*)pm);
857 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
858 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
860 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
861 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
863 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
864 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
865 &xmm_dst_lo, &xmm_dst_hi,
866 &xmm_dst_lo, &xmm_dst_hi);
868 save_128_aligned ((__m128i*)pd,
869 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
880 s = combine1 (ps, pm);
883 *pd++ = core_combine_in_u_pixelsse2 (d, s);
891 static force_inline void
892 core_combine_reverse_in_u_sse2 (uint32_t* pd,
899 __m128i xmm_src_lo, xmm_src_hi;
900 __m128i xmm_dst_lo, xmm_dst_hi;
902 /* call prefetch hint to optimize cache load*/
903 cache_prefetch ((__m128i*)ps);
904 cache_prefetch ((__m128i*)pd);
905 cache_prefetch ((__m128i*)pm);
907 while (w && ((unsigned long) pd & 15))
909 s = combine1 (ps, pm);
912 *pd++ = core_combine_in_u_pixelsse2 (s, d);
919 /* call prefetch hint to optimize cache load*/
920 cache_prefetch ((__m128i*)ps);
921 cache_prefetch ((__m128i*)pd);
922 cache_prefetch ((__m128i*)pm);
926 /* fill cache line with next memory */
927 cache_prefetch_next ((__m128i*)ps);
928 cache_prefetch_next ((__m128i*)pd);
929 cache_prefetch_next ((__m128i*)pm);
931 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
932 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
934 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
935 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
937 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
938 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
939 &xmm_src_lo, &xmm_src_hi,
940 &xmm_dst_lo, &xmm_dst_hi);
943 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
954 s = combine1 (ps, pm);
957 *pd++ = core_combine_in_u_pixelsse2 (s, d);
965 static force_inline void
966 core_combine_reverse_out_u_sse2 (uint32_t* pd,
971 /* call prefetch hint to optimize cache load*/
972 cache_prefetch ((__m128i*)ps);
973 cache_prefetch ((__m128i*)pd);
974 cache_prefetch ((__m128i*)pm);
976 while (w && ((unsigned long) pd & 15))
978 uint32_t s = combine1 (ps, pm);
981 *pd++ = pack_1x64_32 (
983 unpack_32_1x64 (d), negate_1x64 (
984 expand_alpha_1x64 (unpack_32_1x64 (s)))));
992 /* call prefetch hint to optimize cache load*/
993 cache_prefetch ((__m128i*)ps);
994 cache_prefetch ((__m128i*)pd);
995 cache_prefetch ((__m128i*)pm);
999 __m128i xmm_src_lo, xmm_src_hi;
1000 __m128i xmm_dst_lo, xmm_dst_hi;
1002 /* fill cache line with next memory */
1003 cache_prefetch_next ((__m128i*)ps);
1004 cache_prefetch_next ((__m128i*)pd);
1005 cache_prefetch_next ((__m128i*)pm);
1007 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1008 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1010 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1011 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1013 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1014 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1016 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1017 &xmm_src_lo, &xmm_src_hi,
1018 &xmm_dst_lo, &xmm_dst_hi);
1021 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1033 uint32_t s = combine1 (ps, pm);
1036 *pd++ = pack_1x64_32 (
1038 unpack_32_1x64 (d), negate_1x64 (
1039 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1047 static force_inline void
1048 core_combine_out_u_sse2 (uint32_t* pd,
1053 /* call prefetch hint to optimize cache load*/
1054 cache_prefetch ((__m128i*)ps);
1055 cache_prefetch ((__m128i*)pd);
1056 cache_prefetch ((__m128i*)pm);
1058 while (w && ((unsigned long) pd & 15))
1060 uint32_t s = combine1 (ps, pm);
1063 *pd++ = pack_1x64_32 (
1065 unpack_32_1x64 (s), negate_1x64 (
1066 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1073 /* call prefetch hint to optimize cache load*/
1074 cache_prefetch ((__m128i*)ps);
1075 cache_prefetch ((__m128i*)pd);
1076 cache_prefetch ((__m128i*)pm);
1080 __m128i xmm_src_lo, xmm_src_hi;
1081 __m128i xmm_dst_lo, xmm_dst_hi;
1083 /* fill cache line with next memory */
1084 cache_prefetch_next ((__m128i*)ps);
1085 cache_prefetch_next ((__m128i*)pd);
1086 cache_prefetch_next ((__m128i*)pm);
1088 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1089 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1091 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1092 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1094 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1095 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1097 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1098 &xmm_dst_lo, &xmm_dst_hi,
1099 &xmm_dst_lo, &xmm_dst_hi);
1102 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1113 uint32_t s = combine1 (ps, pm);
1116 *pd++ = pack_1x64_32 (
1118 unpack_32_1x64 (s), negate_1x64 (
1119 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1127 static force_inline uint32_t
1128 core_combine_atop_u_pixel_sse2 (uint32_t src,
1131 __m64 s = unpack_32_1x64 (src);
1132 __m64 d = unpack_32_1x64 (dst);
1134 __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1135 __m64 da = expand_alpha_1x64 (d);
1137 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1140 static force_inline void
1141 core_combine_atop_u_sse2 (uint32_t* pd,
1148 __m128i xmm_src_lo, xmm_src_hi;
1149 __m128i xmm_dst_lo, xmm_dst_hi;
1150 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1151 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1153 /* call prefetch hint to optimize cache load*/
1154 cache_prefetch ((__m128i*)ps);
1155 cache_prefetch ((__m128i*)pd);
1156 cache_prefetch ((__m128i*)pm);
1158 while (w && ((unsigned long) pd & 15))
1160 s = combine1 (ps, pm);
1163 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1170 /* call prefetch hint to optimize cache load*/
1171 cache_prefetch ((__m128i*)ps);
1172 cache_prefetch ((__m128i*)pd);
1173 cache_prefetch ((__m128i*)pm);
1177 /* fill cache line with next memory */
1178 cache_prefetch_next ((__m128i*)ps);
1179 cache_prefetch_next ((__m128i*)pd);
1180 cache_prefetch_next ((__m128i*)pm);
1182 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1183 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1185 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1186 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1188 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1189 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1190 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1191 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1193 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1194 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1196 pix_add_multiply_2x128 (
1197 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1198 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1199 &xmm_dst_lo, &xmm_dst_hi);
1202 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1213 s = combine1 (ps, pm);
1216 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1224 static force_inline uint32_t
1225 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1228 __m64 s = unpack_32_1x64 (src);
1229 __m64 d = unpack_32_1x64 (dst);
1231 __m64 sa = expand_alpha_1x64 (s);
1232 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1234 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1237 static force_inline void
1238 core_combine_reverse_atop_u_sse2 (uint32_t* pd,
1245 __m128i xmm_src_lo, xmm_src_hi;
1246 __m128i xmm_dst_lo, xmm_dst_hi;
1247 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1248 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1250 /* call prefetch hint to optimize cache load*/
1251 cache_prefetch ((__m128i*)ps);
1252 cache_prefetch ((__m128i*)pd);
1253 cache_prefetch ((__m128i*)pm);
1255 while (w && ((unsigned long) pd & 15))
1257 s = combine1 (ps, pm);
1260 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1267 /* call prefetch hint to optimize cache load*/
1268 cache_prefetch ((__m128i*)ps);
1269 cache_prefetch ((__m128i*)pd);
1270 cache_prefetch ((__m128i*)pm);
1274 /* fill cache line with next memory */
1275 cache_prefetch_next ((__m128i*)ps);
1276 cache_prefetch_next ((__m128i*)pd);
1277 cache_prefetch_next ((__m128i*)pm);
1279 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1280 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1282 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1283 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1285 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1286 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1287 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1288 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1290 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1291 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1293 pix_add_multiply_2x128 (
1294 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1295 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1296 &xmm_dst_lo, &xmm_dst_hi);
1299 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1310 s = combine1 (ps, pm);
1313 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1321 static force_inline uint32_t
1322 core_combine_xor_u_pixel_sse2 (uint32_t src,
1325 __m64 s = unpack_32_1x64 (src);
1326 __m64 d = unpack_32_1x64 (dst);
1328 __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1329 __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1331 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1334 static force_inline void
1335 core_combine_xor_u_sse2 (uint32_t* dst,
1336 const uint32_t* src,
1337 const uint32_t *mask,
1343 const uint32_t* ps = src;
1344 const uint32_t* pm = mask;
1346 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1347 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1348 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1349 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1351 /* call prefetch hint to optimize cache load*/
1352 cache_prefetch ((__m128i*)ps);
1353 cache_prefetch ((__m128i*)pd);
1354 cache_prefetch ((__m128i*)pm);
1356 while (w && ((unsigned long) pd & 15))
1358 s = combine1 (ps, pm);
1361 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1368 /* call prefetch hint to optimize cache load*/
1369 cache_prefetch ((__m128i*)ps);
1370 cache_prefetch ((__m128i*)pd);
1371 cache_prefetch ((__m128i*)pm);
1375 /* fill cache line with next memory */
1376 cache_prefetch_next ((__m128i*)ps);
1377 cache_prefetch_next ((__m128i*)pd);
1378 cache_prefetch_next ((__m128i*)pm);
1380 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1381 xmm_dst = load_128_aligned ((__m128i*) pd);
1383 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1384 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1386 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1387 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1388 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1389 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1391 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1392 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1393 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1394 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1396 pix_add_multiply_2x128 (
1397 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1398 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1399 &xmm_dst_lo, &xmm_dst_hi);
1402 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1413 s = combine1 (ps, pm);
1416 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1424 static force_inline void
1425 core_combine_add_u_sse2 (uint32_t* dst,
1426 const uint32_t* src,
1427 const uint32_t* mask,
1433 const uint32_t* ps = src;
1434 const uint32_t* pm = mask;
1436 /* call prefetch hint to optimize cache load*/
1437 cache_prefetch ((__m128i*)ps);
1438 cache_prefetch ((__m128i*)pd);
1439 cache_prefetch ((__m128i*)pm);
1441 while (w && (unsigned long)pd & 15)
1443 s = combine1 (ps, pm);
1449 *pd++ = _mm_cvtsi64_si32 (
1450 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1454 /* call prefetch hint to optimize cache load*/
1455 cache_prefetch ((__m128i*)ps);
1456 cache_prefetch ((__m128i*)pd);
1457 cache_prefetch ((__m128i*)pm);
1463 /* fill cache line with next memory */
1464 cache_prefetch_next ((__m128i*)ps);
1465 cache_prefetch_next ((__m128i*)pd);
1466 cache_prefetch_next ((__m128i*)pm);
1468 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1471 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1482 s = combine1 (ps, pm);
1486 *pd++ = _mm_cvtsi64_si32 (
1487 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
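/* The SATURATE operator: when a source pixel's alpha exceeds the
 * space left in the destination (~dst alpha), the source is scaled by
 * DIV_UN8 (da, sa) - da being that remaining headroom - before the
 * saturating add.  The vector loop compares all four alphas at once
 * and only falls back to the per pixel path when at least one pixel
 * needs the scaling.
 */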
1493 static force_inline uint32_t
1494 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1497 __m64 ms = unpack_32_1x64 (src);
1498 __m64 md = unpack_32_1x64 (dst);
1499 uint32_t sa = src >> 24;
1500 uint32_t da = ~dst >> 24;
1504 ms = pix_multiply_1x64 (
1505 ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1508 return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1511 static force_inline void
1512 core_combine_saturate_u_sse2 (uint32_t * pd,
1520 __m128i xmm_src, xmm_dst;
1522 /* call prefetch hint to optimize cache load*/
1523 cache_prefetch ((__m128i*)ps);
1524 cache_prefetch ((__m128i*)pd);
1525 cache_prefetch ((__m128i*)pm);
1527 while (w && (unsigned long)pd & 15)
1529 s = combine1 (ps, pm);
1532 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1539 /* call prefetch hint to optimize cache load*/
1540 cache_prefetch ((__m128i*)ps);
1541 cache_prefetch ((__m128i*)pd);
1542 cache_prefetch ((__m128i*)pm);
1546 /* fill cache line with next memory */
1547 cache_prefetch_next ((__m128i*)ps);
1548 cache_prefetch_next ((__m128i*)pd);
1549 cache_prefetch_next ((__m128i*)pm);
1551 xmm_dst = load_128_aligned ((__m128i*)pd);
1552 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1554 pack_cmp = _mm_movemask_epi8 (
1556 _mm_srli_epi32 (xmm_src, 24),
1557 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1559 /* if any src alpha is greater than the corresponding ~dst alpha */
1562 s = combine1 (ps++, pm);
1564 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1568 s = combine1 (ps++, pm);
1570 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1574 s = combine1 (ps++, pm);
1576 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1580 s = combine1 (ps++, pm);
1582 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1588 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1601 s = combine1 (ps, pm);
1604 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1611 static force_inline void
1612 core_combine_src_ca_sse2 (uint32_t* pd,
1619 __m128i xmm_src_lo, xmm_src_hi;
1620 __m128i xmm_mask_lo, xmm_mask_hi;
1621 __m128i xmm_dst_lo, xmm_dst_hi;
1623 /* call prefetch hint to optimize cache load*/
1624 cache_prefetch ((__m128i*)ps);
1625 cache_prefetch ((__m128i*)pd);
1626 cache_prefetch ((__m128i*)pm);
1628 while (w && (unsigned long)pd & 15)
1632 *pd++ = pack_1x64_32 (
1633 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1637 /* call prefetch hint to optimize cache load*/
1638 cache_prefetch ((__m128i*)ps);
1639 cache_prefetch ((__m128i*)pd);
1640 cache_prefetch ((__m128i*)pm);
1644 /* fill cache line with next memory */
1645 cache_prefetch_next ((__m128i*)ps);
1646 cache_prefetch_next ((__m128i*)pd);
1647 cache_prefetch_next ((__m128i*)pm);
1649 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1650 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1652 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1653 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1655 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1656 &xmm_mask_lo, &xmm_mask_hi,
1657 &xmm_dst_lo, &xmm_dst_hi);
1660 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1672 *pd++ = pack_1x64_32 (
1673 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1678 static force_inline uint32_t
1679 core_combine_over_ca_pixel_sse2 (uint32_t src,
1683 __m64 s = unpack_32_1x64 (src);
1684 __m64 expAlpha = expand_alpha_1x64 (s);
1685 __m64 unpk_mask = unpack_32_1x64 (mask);
1686 __m64 unpk_dst = unpack_32_1x64 (dst);
1688 return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1691 static force_inline void
1692 core_combine_over_ca_sse2 (uint32_t* pd,
1699 __m128i xmm_alpha_lo, xmm_alpha_hi;
1700 __m128i xmm_src_lo, xmm_src_hi;
1701 __m128i xmm_dst_lo, xmm_dst_hi;
1702 __m128i xmm_mask_lo, xmm_mask_hi;
1704 /* call prefetch hint to optimize cache load*/
1705 cache_prefetch ((__m128i*)ps);
1706 cache_prefetch ((__m128i*)pd);
1707 cache_prefetch ((__m128i*)pm);
1709 while (w && (unsigned long)pd & 15)
1715 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1719 /* call prefetch hint to optimize cache load*/
1720 cache_prefetch ((__m128i*)ps);
1721 cache_prefetch ((__m128i*)pd);
1722 cache_prefetch ((__m128i*)pm);
1726 /* fill cache line with next memory */
1727 cache_prefetch_next ((__m128i*)ps);
1728 cache_prefetch_next ((__m128i*)pd);
1729 cache_prefetch_next ((__m128i*)pm);
1731 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1732 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1733 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1735 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1736 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1737 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1739 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1740 &xmm_alpha_lo, &xmm_alpha_hi);
1742 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1743 &xmm_alpha_lo, &xmm_alpha_hi,
1744 &xmm_mask_lo, &xmm_mask_hi,
1745 &xmm_dst_lo, &xmm_dst_hi);
1748 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1762 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1767 static force_inline uint32_t
1768 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1772 __m64 d = unpack_32_1x64 (dst);
1774 return pack_1x64_32 (
1775 over_1x64 (d, expand_alpha_1x64 (d),
1776 pix_multiply_1x64 (unpack_32_1x64 (src),
1777 unpack_32_1x64 (mask))));
1780 static force_inline void
1781 core_combine_over_reverse_ca_sse2 (uint32_t* pd,
1788 __m128i xmm_alpha_lo, xmm_alpha_hi;
1789 __m128i xmm_src_lo, xmm_src_hi;
1790 __m128i xmm_dst_lo, xmm_dst_hi;
1791 __m128i xmm_mask_lo, xmm_mask_hi;
1793 /* call prefetch hint to optimize cache load*/
1794 cache_prefetch ((__m128i*)ps);
1795 cache_prefetch ((__m128i*)pd);
1796 cache_prefetch ((__m128i*)pm);
1798 while (w && (unsigned long)pd & 15)
1804 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1808 /* call prefetch hint to optimize cache load*/
1809 cache_prefetch ((__m128i*)ps);
1810 cache_prefetch ((__m128i*)pd);
1811 cache_prefetch ((__m128i*)pm);
1815 /* fill cache line with next memory */
1816 cache_prefetch_next ((__m128i*)ps);
1817 cache_prefetch_next ((__m128i*)pd);
1818 cache_prefetch_next ((__m128i*)pm);
1820 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1821 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1822 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1824 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1825 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1826 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1828 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1829 &xmm_alpha_lo, &xmm_alpha_hi);
1830 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1831 &xmm_mask_lo, &xmm_mask_hi,
1832 &xmm_mask_lo, &xmm_mask_hi);
1834 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1835 &xmm_alpha_lo, &xmm_alpha_hi,
1836 &xmm_mask_lo, &xmm_mask_hi);
1839 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1853 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1858 static force_inline void
1859 core_combine_in_ca_sse2 (uint32_t * pd,
1866 __m128i xmm_alpha_lo, xmm_alpha_hi;
1867 __m128i xmm_src_lo, xmm_src_hi;
1868 __m128i xmm_dst_lo, xmm_dst_hi;
1869 __m128i xmm_mask_lo, xmm_mask_hi;
1871 /* call prefetch hint to optimize cache load*/
1872 cache_prefetch ((__m128i*)ps);
1873 cache_prefetch ((__m128i*)pd);
1874 cache_prefetch ((__m128i*)pm);
1876 while (w && (unsigned long)pd & 15)
1882 *pd++ = pack_1x64_32 (
1884 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1885 expand_alpha_1x64 (unpack_32_1x64 (d))));
1890 /* call prefetch hint to optimize cache load*/
1891 cache_prefetch ((__m128i*)ps);
1892 cache_prefetch ((__m128i*)pd);
1893 cache_prefetch ((__m128i*)pm);
1897 /* fill cache line with next memory */
1898 cache_prefetch_next ((__m128i*)ps);
1899 cache_prefetch_next ((__m128i*)pd);
1900 cache_prefetch_next ((__m128i*)pm);
1902 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1903 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1904 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1906 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1907 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1908 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1910 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1911 &xmm_alpha_lo, &xmm_alpha_hi);
1913 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1914 &xmm_mask_lo, &xmm_mask_hi,
1915 &xmm_dst_lo, &xmm_dst_hi);
1917 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1918 &xmm_alpha_lo, &xmm_alpha_hi,
1919 &xmm_dst_lo, &xmm_dst_hi);
1922 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1936 *pd++ = pack_1x64_32 (
1939 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1940 expand_alpha_1x64 (unpack_32_1x64 (d))));
1946 static force_inline void
1947 core_combine_in_reverse_ca_sse2 (uint32_t * pd,
1954 __m128i xmm_alpha_lo, xmm_alpha_hi;
1955 __m128i xmm_src_lo, xmm_src_hi;
1956 __m128i xmm_dst_lo, xmm_dst_hi;
1957 __m128i xmm_mask_lo, xmm_mask_hi;
1959 /* call prefetch hint to optimize cache load*/
1960 cache_prefetch ((__m128i*)ps);
1961 cache_prefetch ((__m128i*)pd);
1962 cache_prefetch ((__m128i*)pm);
1964 while (w && (unsigned long)pd & 15)
1970 *pd++ = pack_1x64_32 (
1973 pix_multiply_1x64 (unpack_32_1x64 (m),
1974 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1978 /* call prefetch hint to optimize cache load*/
1979 cache_prefetch ((__m128i*)ps);
1980 cache_prefetch ((__m128i*)pd);
1981 cache_prefetch ((__m128i*)pm);
1985 /* fill cache line with next memory */
1986 cache_prefetch_next ((__m128i*)ps);
1987 cache_prefetch_next ((__m128i*)pd);
1988 cache_prefetch_next ((__m128i*)pm);
1990 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1991 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1992 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1994 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1995 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1996 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1998 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1999 &xmm_alpha_lo, &xmm_alpha_hi);
2000 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2001 &xmm_alpha_lo, &xmm_alpha_hi,
2002 &xmm_alpha_lo, &xmm_alpha_hi);
2004 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2005 &xmm_alpha_lo, &xmm_alpha_hi,
2006 &xmm_dst_lo, &xmm_dst_hi);
2009 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2023 *pd++ = pack_1x64_32 (
2026 pix_multiply_1x64 (unpack_32_1x64 (m),
2027 expand_alpha_1x64 (unpack_32_1x64 (s)))));
2032 static force_inline void
2033 core_combine_out_ca_sse2 (uint32_t * pd,
2040 __m128i xmm_alpha_lo, xmm_alpha_hi;
2041 __m128i xmm_src_lo, xmm_src_hi;
2042 __m128i xmm_dst_lo, xmm_dst_hi;
2043 __m128i xmm_mask_lo, xmm_mask_hi;
2045 /* call prefetch hint to optimize cache load*/
2046 cache_prefetch ((__m128i*)ps);
2047 cache_prefetch ((__m128i*)pd);
2048 cache_prefetch ((__m128i*)pm);
2050 while (w && (unsigned long)pd & 15)
2056 *pd++ = pack_1x64_32 (
2059 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2060 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2064 /* call prefetch hint to optimize cache load*/
2065 cache_prefetch ((__m128i*)ps);
2066 cache_prefetch ((__m128i*)pd);
2067 cache_prefetch ((__m128i*)pm);
2071 /* fill cache line with next memory */
2072 cache_prefetch_next ((__m128i*)ps);
2073 cache_prefetch_next ((__m128i*)pd);
2074 cache_prefetch_next ((__m128i*)pm);
2076 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2077 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2078 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2080 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2081 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2082 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2084 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2085 &xmm_alpha_lo, &xmm_alpha_hi);
2086 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2087 &xmm_alpha_lo, &xmm_alpha_hi);
2089 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2090 &xmm_mask_lo, &xmm_mask_hi,
2091 &xmm_dst_lo, &xmm_dst_hi);
2092 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2093 &xmm_alpha_lo, &xmm_alpha_hi,
2094 &xmm_dst_lo, &xmm_dst_hi);
2097 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2111 *pd++ = pack_1x64_32 (
2114 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2115 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2121 static force_inline void
2122 core_combine_out_reverse_ca_sse2 (uint32_t * pd,
2129 __m128i xmm_alpha_lo, xmm_alpha_hi;
2130 __m128i xmm_src_lo, xmm_src_hi;
2131 __m128i xmm_dst_lo, xmm_dst_hi;
2132 __m128i xmm_mask_lo, xmm_mask_hi;
2134 /* call prefetch hint to optimize cache load*/
2135 cache_prefetch ((__m128i*)ps);
2136 cache_prefetch ((__m128i*)pd);
2137 cache_prefetch ((__m128i*)pm);
2139 while (w && (unsigned long)pd & 15)
2145 *pd++ = pack_1x64_32 (
2148 negate_1x64 (pix_multiply_1x64 (
2150 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2154 /* call prefetch hint to optimize cache load*/
2155 cache_prefetch ((__m128i*)ps);
2156 cache_prefetch ((__m128i*)pd);
2157 cache_prefetch ((__m128i*)pm);
2161 /* fill cache line with next memory */
2162 cache_prefetch_next ((__m128i*)ps);
2163 cache_prefetch_next ((__m128i*)pd);
2164 cache_prefetch_next ((__m128i*)pm);
2166 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2167 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2168 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2170 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2171 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2172 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2174 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2175 &xmm_alpha_lo, &xmm_alpha_hi);
2177 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2178 &xmm_alpha_lo, &xmm_alpha_hi,
2179 &xmm_mask_lo, &xmm_mask_hi);
2181 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2182 &xmm_mask_lo, &xmm_mask_hi);
2184 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2185 &xmm_mask_lo, &xmm_mask_hi,
2186 &xmm_dst_lo, &xmm_dst_hi);
2189 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2203 *pd++ = pack_1x64_32 (
2206 negate_1x64 (pix_multiply_1x64 (
2208 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2213 static force_inline uint32_t
2214 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2218 __m64 m = unpack_32_1x64 (mask);
2219 __m64 s = unpack_32_1x64 (src);
2220 __m64 d = unpack_32_1x64 (dst);
2221 __m64 sa = expand_alpha_1x64 (s);
2222 __m64 da = expand_alpha_1x64 (d);
2224 s = pix_multiply_1x64 (s, m);
2225 m = negate_1x64 (pix_multiply_1x64 (m, sa));
2227 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2230 static force_inline void
2231 core_combine_atop_ca_sse2 (uint32_t * pd,
2238 __m128i xmm_src_lo, xmm_src_hi;
2239 __m128i xmm_dst_lo, xmm_dst_hi;
2240 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2241 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2242 __m128i xmm_mask_lo, xmm_mask_hi;
2244 /* call prefetch hint to optimize cache load*/
2245 cache_prefetch ((__m128i*)ps);
2246 cache_prefetch ((__m128i*)pd);
2247 cache_prefetch ((__m128i*)pm);
2249 while (w && (unsigned long)pd & 15)
2255 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2259 /* call prefetch hint to optimize cache load*/
2260 cache_prefetch ((__m128i*)ps);
2261 cache_prefetch ((__m128i*)pd);
2262 cache_prefetch ((__m128i*)pm);
2266 /* fill cache line with next memory */
2267 cache_prefetch_next ((__m128i*)ps);
2268 cache_prefetch_next ((__m128i*)pd);
2269 cache_prefetch_next ((__m128i*)pm);
2271 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2272 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2273 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2275 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2276 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2277 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2279 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2280 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2281 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2282 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2284 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2285 &xmm_mask_lo, &xmm_mask_hi,
2286 &xmm_src_lo, &xmm_src_hi);
2287 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2288 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2289 &xmm_mask_lo, &xmm_mask_hi);
2291 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2293 pix_add_multiply_2x128 (
2294 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2295 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2296 &xmm_dst_lo, &xmm_dst_hi);
2299 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2313 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2318 static force_inline uint32_t
2319 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2323 __m64 m = unpack_32_1x64 (mask);
2324 __m64 s = unpack_32_1x64 (src);
2325 __m64 d = unpack_32_1x64 (dst);
2327 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2328 __m64 sa = expand_alpha_1x64 (s);
2330 s = pix_multiply_1x64 (s, m);
2331 m = pix_multiply_1x64 (m, sa);
2333 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2336 static force_inline void
2337 core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
2344 __m128i xmm_src_lo, xmm_src_hi;
2345 __m128i xmm_dst_lo, xmm_dst_hi;
2346 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2347 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2348 __m128i xmm_mask_lo, xmm_mask_hi;
2350 /* call prefetch hint to optimize cache load*/
2351 cache_prefetch ((__m128i*)ps);
2352 cache_prefetch ((__m128i*)pd);
2353 cache_prefetch ((__m128i*)pm);
2355 while (w && (unsigned long)pd & 15)
2361 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2365 /* call prefetch hint to optimize cache load*/
2366 cache_prefetch ((__m128i*)ps);
2367 cache_prefetch ((__m128i*)pd);
2368 cache_prefetch ((__m128i*)pm);
2372 /* fill cache line with next memory */
2373 cache_prefetch_next ((__m128i*)ps);
2374 cache_prefetch_next ((__m128i*)pd);
2375 cache_prefetch_next ((__m128i*)pm);
2377 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2378 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2379 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2381 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2382 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2383 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2385 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2386 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2387 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2388 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2390 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2391 &xmm_mask_lo, &xmm_mask_hi,
2392 &xmm_src_lo, &xmm_src_hi);
2393 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2394 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2395 &xmm_mask_lo, &xmm_mask_hi);
2397 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2398 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2400 pix_add_multiply_2x128 (
2401 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2402 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2403 &xmm_dst_lo, &xmm_dst_hi);
2406 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2420 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2425 static force_inline uint32_t
2426 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2430 __m64 a = unpack_32_1x64 (mask);
2431 __m64 s = unpack_32_1x64 (src);
2432 __m64 d = unpack_32_1x64 (dst);
2434 __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2435 a, expand_alpha_1x64 (s)));
2436 __m64 dest = pix_multiply_1x64 (s, a);
2437 __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2439 return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2445 static force_inline void
2446 core_combine_xor_ca_sse2 (uint32_t * pd,
2453 __m128i xmm_src_lo, xmm_src_hi;
2454 __m128i xmm_dst_lo, xmm_dst_hi;
2455 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2456 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2457 __m128i xmm_mask_lo, xmm_mask_hi;
2459 /* call prefetch hint to optimize cache load*/
2460 cache_prefetch ((__m128i*)ps);
2461 cache_prefetch ((__m128i*)pd);
2462 cache_prefetch ((__m128i*)pm);
2464 while (w && (unsigned long)pd & 15)
2470 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2474 /* call prefetch hint to optimize cache load*/
2475 cache_prefetch ((__m128i*)ps);
2476 cache_prefetch ((__m128i*)pd);
2477 cache_prefetch ((__m128i*)pm);
2481 /* fill cache line with next memory */
2482 cache_prefetch_next ((__m128i*)ps);
2483 cache_prefetch_next ((__m128i*)pd);
2484 cache_prefetch_next ((__m128i*)pm);
2486 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2487 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2488 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2490 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2491 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2492 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2494 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2495 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2496 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2497 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2499 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2500 &xmm_mask_lo, &xmm_mask_hi,
2501 &xmm_src_lo, &xmm_src_hi);
2502 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2503 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2504 &xmm_mask_lo, &xmm_mask_hi);
2506 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2507 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2508 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2509 &xmm_mask_lo, &xmm_mask_hi);
2511 pix_add_multiply_2x128 (
2512 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2513 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2514 &xmm_dst_lo, &xmm_dst_hi);
2517 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2531 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2536 static force_inline void
2537 core_combine_add_ca_sse2 (uint32_t * pd,
2544 __m128i xmm_src_lo, xmm_src_hi;
2545 __m128i xmm_dst_lo, xmm_dst_hi;
2546 __m128i xmm_mask_lo, xmm_mask_hi;
2548 /* call prefetch hint to optimize cache load*/
2549 cache_prefetch ((__m128i*)ps);
2550 cache_prefetch ((__m128i*)pd);
2551 cache_prefetch ((__m128i*)pm);
2553 while (w && (unsigned long)pd & 15)
2559 *pd++ = pack_1x64_32 (
2560 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2561 unpack_32_1x64 (m)),
2562 unpack_32_1x64 (d)));
2566 /* call prefetch hint to optimize cache load*/
2567 cache_prefetch ((__m128i*)ps);
2568 cache_prefetch ((__m128i*)pd);
2569 cache_prefetch ((__m128i*)pm);
2573 /* fill cache line with next memory */
2574 cache_prefetch_next ((__m128i*)ps);
2575 cache_prefetch_next ((__m128i*)pd);
2576 cache_prefetch_next ((__m128i*)pm);
2578 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2579 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2580 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2582 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2583 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2584 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2586 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2587 &xmm_mask_lo, &xmm_mask_hi,
2588 &xmm_src_lo, &xmm_src_hi);
2591 (__m128i*)pd, pack_2x128_128 (
2592 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2593 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2607 *pd++ = pack_1x64_32 (
2608 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2609 unpack_32_1x64 (m)),
2610 unpack_32_1x64 (d)));
2615 /* ---------------------------------------------------
2616 * fb_compose_setup_SSE2
2618 static force_inline __m64
2619 create_mask_16_64 (uint16_t mask)
2621 return _mm_set1_pi16 (mask);
2624 static force_inline __m128i
2625 create_mask_16_128 (uint16_t mask)
2627 return _mm_set1_epi16 (mask);
2630 static force_inline __m64
2631 create_mask_2x32_64 (uint32_t mask0,
2634 return _mm_set_pi32 (mask0, mask1);
2637 static force_inline __m128i
2638 create_mask_2x32_128 (uint32_t mask0,
2641 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2644 /* SSE2 code patch for fbcompose.c */
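/* These thin wrappers only adapt the core_combine_*_sse2 () helpers
 * above to the combine function signature that pixman's
 * implementation table expects; the imp and op arguments are unused.
 */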
2647 sse2_combine_over_u (pixman_implementation_t *imp,
2650 const uint32_t * src,
2651 const uint32_t * mask,
2654 core_combine_over_u_sse2 (dst, src, mask, width);
2659 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2662 const uint32_t * src,
2663 const uint32_t * mask,
2666 core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2671 sse2_combine_in_u (pixman_implementation_t *imp,
2674 const uint32_t * src,
2675 const uint32_t * mask,
2678 core_combine_in_u_sse2 (dst, src, mask, width);
2683 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2686 const uint32_t * src,
2687 const uint32_t * mask,
2690 core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2695 sse2_combine_out_u (pixman_implementation_t *imp,
2698 const uint32_t * src,
2699 const uint32_t * mask,
2702 core_combine_out_u_sse2 (dst, src, mask, width);
2707 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2710 const uint32_t * src,
2711 const uint32_t * mask,
2714 core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2719 sse2_combine_atop_u (pixman_implementation_t *imp,
2722 const uint32_t * src,
2723 const uint32_t * mask,
2726 core_combine_atop_u_sse2 (dst, src, mask, width);
2731 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2734 const uint32_t * src,
2735 const uint32_t * mask,
2738 core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2743 sse2_combine_xor_u (pixman_implementation_t *imp,
2746 const uint32_t * src,
2747 const uint32_t * mask,
2750 core_combine_xor_u_sse2 (dst, src, mask, width);
2755 sse2_combine_add_u (pixman_implementation_t *imp,
2758 const uint32_t * src,
2759 const uint32_t * mask,
2762 core_combine_add_u_sse2 (dst, src, mask, width);
2767 sse2_combine_saturate_u (pixman_implementation_t *imp,
2770 const uint32_t * src,
2771 const uint32_t * mask,
2774 core_combine_saturate_u_sse2 (dst, src, mask, width);
2779 sse2_combine_src_ca (pixman_implementation_t *imp,
2782 const uint32_t * src,
2783 const uint32_t * mask,
2786 core_combine_src_ca_sse2 (dst, src, mask, width);
2791 sse2_combine_over_ca (pixman_implementation_t *imp,
2794 const uint32_t * src,
2795 const uint32_t * mask,
2798 core_combine_over_ca_sse2 (dst, src, mask, width);
2803 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2806 const uint32_t * src,
2807 const uint32_t * mask,
2810 core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2815 sse2_combine_in_ca (pixman_implementation_t *imp,
2818 const uint32_t * src,
2819 const uint32_t * mask,
2822 core_combine_in_ca_sse2 (dst, src, mask, width);
2827 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2830 const uint32_t * src,
2831 const uint32_t * mask,
2834 core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2839 sse2_combine_out_ca (pixman_implementation_t *imp,
2842 const uint32_t * src,
2843 const uint32_t * mask,
2846 core_combine_out_ca_sse2 (dst, src, mask, width);
2851 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2854 const uint32_t * src,
2855 const uint32_t * mask,
2858 core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2863 sse2_combine_atop_ca (pixman_implementation_t *imp,
2866 const uint32_t * src,
2867 const uint32_t * mask,
2870 core_combine_atop_ca_sse2 (dst, src, mask, width);
2875 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2878 const uint32_t * src,
2879 const uint32_t * mask,
2882 core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2887 sse2_combine_xor_ca (pixman_implementation_t *imp,
2890 const uint32_t * src,
2891 const uint32_t * mask,
2894 core_combine_xor_ca_sse2 (dst, src, mask, width);
2899 sse2_combine_add_ca (pixman_implementation_t *imp,
2902 const uint32_t * src,
2903 const uint32_t * mask,
2906 core_combine_add_ca_sse2 (dst, src, mask, width);
2910 /* -------------------------------------------------------------------
2911 * composite_over_n_8888
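 * A solid source composited OVER a 32-bit (a8r8g8b8 / x8r8g8b8) destination.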
2915 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2917 pixman_image_t * src_image,
2918 pixman_image_t * mask_image,
2919 pixman_image_t * dst_image,
2930 uint32_t *dst_line, *dst, d;
2933 __m128i xmm_src, xmm_alpha;
2934 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2936 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2941 PIXMAN_IMAGE_GET_LINE (
2942 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2944 xmm_src = expand_pixel_32_1x128 (src);
2945 xmm_alpha = expand_alpha_1x128 (xmm_src);
2951 /* call prefetch hint to optimize cache load*/
2952 cache_prefetch ((__m128i*)dst);
2954 dst_line += dst_stride;
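/* Align dst on a 16-byte boundary */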
2957 while (w && (unsigned long)dst & 15)
2960 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2961 _mm_movepi64_pi64 (xmm_alpha),
2962 unpack_32_1x64 (d)));
2966 cache_prefetch ((__m128i*)dst);
2970 /* fill cache line with next memory */
2971 cache_prefetch_next ((__m128i*)dst);
2973 xmm_dst = load_128_aligned ((__m128i*)dst);
2975 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2977 over_2x128 (&xmm_src, &xmm_src,
2978 &xmm_alpha, &xmm_alpha,
2979 &xmm_dst_lo, &xmm_dst_hi);
2981 /* rebuild the 4 pixel data and save */
2983 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2992 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2993 _mm_movepi64_pi64 (xmm_alpha),
2994 unpack_32_1x64 (d)));
3002 /* ---------------------------------------------------------------------
3003 * composite_over_n_0565
3006 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3008 pixman_image_t * src_image,
3009 pixman_image_t * mask_image,
3010 pixman_image_t * dst_image,
3021 uint16_t *dst_line, *dst, d;
3024 __m128i xmm_src, xmm_alpha;
3025 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3027 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3032 PIXMAN_IMAGE_GET_LINE (
3033 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3035 xmm_src = expand_pixel_32_1x128 (src);
3036 xmm_alpha = expand_alpha_1x128 (xmm_src);
3042 /* call prefetch hint to optimize cache load*/
3043 cache_prefetch ((__m128i*)dst);
3045 dst_line += dst_stride;
3048 while (w && (unsigned long)dst & 15)
3052 *dst++ = pack_565_32_16 (
3053 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3054 _mm_movepi64_pi64 (xmm_alpha),
3055 expand565_16_1x64 (d))));
3059 /* call prefetch hint to optimize cache load*/
3060 cache_prefetch ((__m128i*)dst);
3064 /* fill cache line with next memory */
3065 cache_prefetch_next ((__m128i*)dst);
3067 xmm_dst = load_128_aligned ((__m128i*)dst);
3069 unpack_565_128_4x128 (xmm_dst,
3070 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3072 over_2x128 (&xmm_src, &xmm_src,
3073 &xmm_alpha, &xmm_alpha,
3074 &xmm_dst0, &xmm_dst1);
3075 over_2x128 (&xmm_src, &xmm_src,
3076 &xmm_alpha, &xmm_alpha,
3077 &xmm_dst2, &xmm_dst3);
3079 xmm_dst = pack_565_4x128_128 (
3080 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3082 save_128_aligned ((__m128i*)dst, xmm_dst);
3091 *dst++ = pack_565_32_16 (
3092 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3093 _mm_movepi64_pi64 (xmm_alpha),
3094 expand565_16_1x64 (d))));
3101 /* ------------------------------
3102 * composite_add_n_8888_8888_ca
3105 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3107 pixman_image_t * src_image,
3108 pixman_image_t * mask_image,
3109 pixman_image_t * dst_image,
3120 uint32_t *dst_line, d;
3121 uint32_t *mask_line, m;
3123 int dst_stride, mask_stride;
3125 __m128i xmm_src, xmm_alpha;
3127 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3129 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3131 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3137 PIXMAN_IMAGE_GET_LINE (
3138 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3139 PIXMAN_IMAGE_GET_LINE (
3140 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3142 xmm_src = _mm_unpacklo_epi8 (
3143 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3144 xmm_alpha = expand_alpha_1x128 (xmm_src);
3145 mmx_src = _mm_movepi64_pi64 (xmm_src);
3146 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3151 const uint32_t *pm = (uint32_t *)mask_line;
3152 uint32_t *pd = (uint32_t *)dst_line;
3154 dst_line += dst_stride;
3155 mask_line += mask_stride;
3157 /* call prefetch hint to optimize cache load*/
3158 cache_prefetch ((__m128i*)pd);
3159 cache_prefetch ((__m128i*)pm);
3161 while (w && (unsigned long)pd & 15)
3169 mmx_mask = unpack_32_1x64 (m);
3170 mmx_dest = unpack_32_1x64 (d);
3172 *pd = pack_1x64_32 (
3173 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3180 /* call prefetch hint to optimize cache load*/
3181 cache_prefetch ((__m128i*)pd);
3182 cache_prefetch ((__m128i*)pm);
3186 /* fill cache line with next memory */
3187 cache_prefetch_next ((__m128i*)pd);
3188 cache_prefetch_next ((__m128i*)pm);
3190 xmm_mask = load_128_unaligned ((__m128i*)pm);
3194 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3196 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3197 if (pack_cmp != 0xffff)
3199 xmm_dst = load_128_aligned ((__m128i*)pd);
3201 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3203 pix_multiply_2x128 (&xmm_src, &xmm_src,
3204 &xmm_mask_lo, &xmm_mask_hi,
3205 &xmm_mask_lo, &xmm_mask_hi);
3206 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3209 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3225 mmx_mask = unpack_32_1x64 (m);
3226 mmx_dest = unpack_32_1x64 (d);
3228 *pd = pack_1x64_32 (
3229 _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3240 /* ---------------------------------------------------------------------------
3241 * composite_over_n_8888_8888_ca
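 * A solid source composited OVER a 32-bit destination through a
 * 32-bit component-alpha mask.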
3245 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3247 pixman_image_t * src_image,
3248 pixman_image_t * mask_image,
3249 pixman_image_t * dst_image,
3260 uint32_t *dst_line, d;
3261 uint32_t *mask_line, m;
3263 int dst_stride, mask_stride;
3265 __m128i xmm_src, xmm_alpha;
3266 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3267 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3269 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3271 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3276 PIXMAN_IMAGE_GET_LINE (
3277 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3278 PIXMAN_IMAGE_GET_LINE (
3279 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3281 xmm_src = _mm_unpacklo_epi8 (
3282 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3283 xmm_alpha = expand_alpha_1x128 (xmm_src);
3284 mmx_src = _mm_movepi64_pi64 (xmm_src);
3285 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3290 const uint32_t *pm = (uint32_t *)mask_line;
3291 uint32_t *pd = (uint32_t *)dst_line;
3293 dst_line += dst_stride;
3294 mask_line += mask_stride;
3296 /* call prefetch hint to optimize cache load*/
3297 cache_prefetch ((__m128i*)pd);
3298 cache_prefetch ((__m128i*)pm);
3300 while (w && (unsigned long)pd & 15)
3307 mmx_mask = unpack_32_1x64 (m);
3308 mmx_dest = unpack_32_1x64 (d);
3310 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3320 /* call prefetch hint to optimize cache load*/
3321 cache_prefetch ((__m128i*)pd);
3322 cache_prefetch ((__m128i*)pm);
3326 /* fill cache line with next memory */
3327 cache_prefetch_next ((__m128i*)pd);
3328 cache_prefetch_next ((__m128i*)pm);
3330 xmm_mask = load_128_unaligned ((__m128i*)pm);
3334 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3336 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
3337 if (pack_cmp != 0xffff)
3339 xmm_dst = load_128_aligned ((__m128i*)pd);
3341 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3342 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3344 in_over_2x128 (&xmm_src, &xmm_src,
3345 &xmm_alpha, &xmm_alpha,
3346 &xmm_mask_lo, &xmm_mask_hi,
3347 &xmm_dst_lo, &xmm_dst_hi);
3350 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3365 mmx_mask = unpack_32_1x64 (m);
3366 mmx_dest = unpack_32_1x64 (d);
3368 *pd = pack_1x64_32 (
3369 in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3380 /*---------------------------------------------------------------------
3381 * composite_over_8888_n_8888
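 * An a8r8g8b8 source composited OVER a 32-bit destination through a
 * solid a8 mask (only the alpha byte of the solid mask is used).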
3385 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3387 pixman_image_t * src_image,
3388 pixman_image_t * mask_image,
3389 pixman_image_t * dst_image,
3399 uint32_t *dst_line, *dst;
3400 uint32_t *src_line, *src;
3403 int dst_stride, src_stride;
3406 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3407 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3408 __m128i xmm_alpha_lo, xmm_alpha_hi;
3410 PIXMAN_IMAGE_GET_LINE (
3411 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3412 PIXMAN_IMAGE_GET_LINE (
3413 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3415 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3417 xmm_mask = create_mask_16_128 (mask >> 24);
3422 dst_line += dst_stride;
3424 src_line += src_stride;
3427 /* call prefetch hint to optimize cache load*/
3428 cache_prefetch ((__m128i*)dst);
3429 cache_prefetch ((__m128i*)src);
3431 while (w && (unsigned long)dst & 15)
3433 uint32_t s = *src++;
3436 __m64 ms = unpack_32_1x64 (s);
3437 __m64 alpha = expand_alpha_1x64 (ms);
3438 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3439 __m64 dest = unpack_32_1x64 (d);
3441 *dst++ = pack_1x64_32 (
3442 in_over_1x64 (&ms, &alpha, &mask, &dest));
3447 /* call prefetch hint to optimize cache load*/
3448 cache_prefetch ((__m128i*)dst);
3449 cache_prefetch ((__m128i*)src);
3453 /* fill cache line with next memory */
3454 cache_prefetch_next ((__m128i*)dst);
3455 cache_prefetch_next ((__m128i*)src);
3457 xmm_src = load_128_unaligned ((__m128i*)src);
3458 xmm_dst = load_128_aligned ((__m128i*)dst);
3460 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3461 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3462 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3463 &xmm_alpha_lo, &xmm_alpha_hi);
3465 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3466 &xmm_alpha_lo, &xmm_alpha_hi,
3467 &xmm_mask, &xmm_mask,
3468 &xmm_dst_lo, &xmm_dst_hi);
3471 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3480 uint32_t s = *src++;
3483 __m64 ms = unpack_32_1x64 (s);
3484 __m64 alpha = expand_alpha_1x64 (ms);
3485 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3486 __m64 dest = unpack_32_1x64 (d);
3488 *dst++ = pack_1x64_32 (
3489 in_over_1x64 (&ms, &alpha, &mask, &dest));
3498 /* ---------------------------------------------------------------------
3499 * composite_over_x888_n_8888
3502 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3504 pixman_image_t * src_image,
3505 pixman_image_t * mask_image,
3506 pixman_image_t * dst_image,
3516 uint32_t *dst_line, *dst;
3517 uint32_t *src_line, *src;
3519 int dst_stride, src_stride;
3522 __m128i xmm_mask, xmm_alpha;
3523 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3524 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3526 PIXMAN_IMAGE_GET_LINE (
3527 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3528 PIXMAN_IMAGE_GET_LINE (
3529 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3531 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3533 xmm_mask = create_mask_16_128 (mask >> 24);
3534 xmm_alpha = mask_00ff;
3539 dst_line += dst_stride;
3541 src_line += src_stride;
3544 /* call prefetch hint to optimize cache load*/
3545 cache_prefetch ((__m128i*)dst);
3546 cache_prefetch ((__m128i*)src);
3548 while (w && (unsigned long)dst & 15)
3550 uint32_t s = (*src++) | 0xff000000;
3553 __m64 src = unpack_32_1x64 (s);
3554 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3555 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3556 __m64 dest = unpack_32_1x64 (d);
3558 *dst++ = pack_1x64_32 (
3559 in_over_1x64 (&src, &alpha, &mask, &dest));
3564 /* call prefetch hint to optimize cache load*/
3565 cache_prefetch ((__m128i*)dst);
3566 cache_prefetch ((__m128i*)src);
3570 /* fill cache line with next memory */
3571 cache_prefetch_next ((__m128i*)dst);
3572 cache_prefetch_next ((__m128i*)src);
3574 xmm_src = _mm_or_si128 (
3575 load_128_unaligned ((__m128i*)src), mask_ff000000);
3576 xmm_dst = load_128_aligned ((__m128i*)dst);
3578 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3579 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3581 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3582 &xmm_alpha, &xmm_alpha,
3583 &xmm_mask, &xmm_mask,
3584 &xmm_dst_lo, &xmm_dst_hi);
3587 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3597 uint32_t s = (*src++) | 0xff000000;
3600 __m64 src = unpack_32_1x64 (s);
3601 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3602 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3603 __m64 dest = unpack_32_1x64 (d);
3605 *dst++ = pack_1x64_32 (
3606 in_over_1x64 (&src, &alpha, &mask, &dest));
3615 /* --------------------------------------------------------------------
3616 * composite_over_8888_8888
3619 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3621 pixman_image_t * src_image,
3622 pixman_image_t * mask_image,
3623 pixman_image_t * dst_image,
3633 int dst_stride, src_stride;
3634 uint32_t *dst_line, *dst;
3635 uint32_t *src_line, *src;
3637 PIXMAN_IMAGE_GET_LINE (
3638 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3639 PIXMAN_IMAGE_GET_LINE (
3640 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3647 core_combine_over_u_sse2 (dst, src, NULL, width);
3655 /* ------------------------------------------------------------------
3656 * composite_over_8888_0565
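 * An a8r8g8b8 source composited OVER an r5g6b5 destination.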
3658 static force_inline uint16_t
3659 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3663 ms = unpack_32_1x64 (src);
3664 return pack_565_32_16 (
3667 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3671 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3673 pixman_image_t * src_image,
3674 pixman_image_t * mask_image,
3675 pixman_image_t * dst_image,
3685 uint16_t *dst_line, *dst, d;
3686 uint32_t *src_line, *src, s;
3687 int dst_stride, src_stride;
3690 __m128i xmm_alpha_lo, xmm_alpha_hi;
3691 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3692 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3694 PIXMAN_IMAGE_GET_LINE (
3695 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3696 PIXMAN_IMAGE_GET_LINE (
3697 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3702 * This code is copied from the MMX version and keeps its FIXME.
3703 * If it's a problem there, it's probably a problem here too.
3705 assert (src_image->drawable == mask_image->drawable);
3713 /* call prefetch hint to optimize cache load*/
3714 cache_prefetch ((__m128i*)src);
3715 cache_prefetch ((__m128i*)dst);
3717 dst_line += dst_stride;
3718 src_line += src_stride;
3721 /* Align dst on a 16-byte boundary */
3723 ((unsigned long)dst & 15))
3728 *dst++ = composite_over_8888_0565pixel (s, d);
3732 /* call prefetch hint to optimize cache load*/
3733 cache_prefetch ((__m128i*)src);
3734 cache_prefetch ((__m128i*)dst);
3736 /* This is an 8 pixel loop */
3739 /* fill cache line with next memory */
3740 cache_prefetch_next ((__m128i*)src);
3741 cache_prefetch_next ((__m128i*)dst);
3743 /* Load unaligned because the source address
3744 * alignment is not guaranteed.
3746 xmm_src = load_128_unaligned ((__m128i*) src);
3747 xmm_dst = load_128_aligned ((__m128i*) dst);
3750 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3751 unpack_565_128_4x128 (xmm_dst,
3752 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3753 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3754 &xmm_alpha_lo, &xmm_alpha_hi);
3756 /* Load the next 4 pixels from memory
3757 * early to optimize the memory read.
3759 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3761 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3762 &xmm_alpha_lo, &xmm_alpha_hi,
3763 &xmm_dst0, &xmm_dst1);
3766 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3767 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3768 &xmm_alpha_lo, &xmm_alpha_hi);
3770 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3771 &xmm_alpha_lo, &xmm_alpha_hi,
3772 &xmm_dst2, &xmm_dst3);
3775 (__m128i*)dst, pack_565_4x128_128 (
3776 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3788 *dst++ = composite_over_8888_0565pixel (s, d);
3795 /* -----------------------------------------------------------------
3796 * composite_over_n_8_8888
3800 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3802 pixman_image_t * src_image,
3803 pixman_image_t * mask_image,
3804 pixman_image_t * dst_image,
3815 uint32_t *dst_line, *dst;
3816 uint8_t *mask_line, *mask;
3817 int dst_stride, mask_stride;
3821 __m128i xmm_src, xmm_alpha, xmm_def;
3822 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3823 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3825 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3827 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3833 PIXMAN_IMAGE_GET_LINE (
3834 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3835 PIXMAN_IMAGE_GET_LINE (
3836 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3838 xmm_def = create_mask_2x32_128 (src, src);
3839 xmm_src = expand_pixel_32_1x128 (src);
3840 xmm_alpha = expand_alpha_1x128 (xmm_src);
3841 mmx_src = _mm_movepi64_pi64 (xmm_src);
3842 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3847 dst_line += dst_stride;
3849 mask_line += mask_stride;
3852 /* call prefetch hint to optimize cache load*/
3853 cache_prefetch ((__m128i*)mask);
3854 cache_prefetch ((__m128i*)dst);
3856 while (w && (unsigned long)dst & 15)
3858 uint8_t m = *mask++;
3863 mmx_mask = expand_pixel_8_1x64 (m);
3864 mmx_dest = unpack_32_1x64 (d);
3866 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3876 /* call prefetch hint to optimize cache load*/
3877 cache_prefetch ((__m128i*)mask);
3878 cache_prefetch ((__m128i*)dst);
3882 /* fill cache line with next memory */
3883 cache_prefetch_next ((__m128i*)mask);
3884 cache_prefetch_next ((__m128i*)dst);
3886 m = *((uint32_t*)mask);
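/* With an opaque source and all four mask bytes set to 0xff, the four
 * destination pixels become the solid source color. */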
3888 if (srca == 0xff && m == 0xffffffff)
3890 save_128_aligned ((__m128i*)dst, xmm_def);
3894 xmm_dst = load_128_aligned ((__m128i*) dst);
3895 xmm_mask = unpack_32_1x128 (m);
3896 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3899 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3900 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3902 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3903 &xmm_mask_lo, &xmm_mask_hi);
3905 in_over_2x128 (&xmm_src, &xmm_src,
3906 &xmm_alpha, &xmm_alpha,
3907 &xmm_mask_lo, &xmm_mask_hi,
3908 &xmm_dst_lo, &xmm_dst_hi);
3911 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3921 uint8_t m = *mask++;
3926 mmx_mask = expand_pixel_8_1x64 (m);
3927 mmx_dest = unpack_32_1x64 (d);
3929 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3943 /* ----------------------------------------------------------------
3944 * pixman_fill_sse2
3948 pixman_fill_sse2 (uint32_t *bits,
3957 uint32_t byte_width;
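/* A 16 bpp fill can only be handled here when the 32-bit fill value is
 * the same 16-bit pattern repeated in both halves. */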
3962 if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3965 if (bpp != 16 && bpp != 32)
3970 stride = stride * (int) sizeof (uint32_t) / 2;
3971 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3972 byte_width = 2 * width;
3977 stride = stride * (int) sizeof (uint32_t) / 4;
3978 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3979 byte_width = 4 * width;
3983 cache_prefetch ((__m128i*)byte_line);
3984 xmm_def = create_mask_2x32_128 (data, data);
3989 uint8_t *d = byte_line;
3990 byte_line += stride;
3994 cache_prefetch_next ((__m128i*)d);
3996 while (w >= 2 && ((unsigned long)d & 3))
3998 *(uint16_t *)d = data;
4003 while (w >= 4 && ((unsigned long)d & 15))
4005 *(uint32_t *)d = data;
4011 cache_prefetch_next ((__m128i*)d);
4015 cache_prefetch (((__m128i*)d) + 12);
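/* Write 128 bytes per iteration with eight aligned 16-byte stores */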
4017 save_128_aligned ((__m128i*)(d), xmm_def);
4018 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4019 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4020 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4021 save_128_aligned ((__m128i*)(d + 64), xmm_def);
4022 save_128_aligned ((__m128i*)(d + 80), xmm_def);
4023 save_128_aligned ((__m128i*)(d + 96), xmm_def);
4024 save_128_aligned ((__m128i*)(d + 112), xmm_def);
4032 cache_prefetch (((__m128i*)d) + 8);
4034 save_128_aligned ((__m128i*)(d), xmm_def);
4035 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4036 save_128_aligned ((__m128i*)(d + 32), xmm_def);
4037 save_128_aligned ((__m128i*)(d + 48), xmm_def);
4043 cache_prefetch_next ((__m128i*)d);
4047 save_128_aligned ((__m128i*)(d), xmm_def);
4048 save_128_aligned ((__m128i*)(d + 16), xmm_def);
4056 save_128_aligned ((__m128i*)(d), xmm_def);
4062 cache_prefetch_next ((__m128i*)d);
4066 *(uint32_t *)d = data;
4074 *(uint16_t *)d = data;
4085 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4087 pixman_image_t * src_image,
4088 pixman_image_t * mask_image,
4089 pixman_image_t * dst_image,
4100 uint32_t *dst_line, *dst;
4101 uint8_t *mask_line, *mask;
4102 int dst_stride, mask_stride;
4106 __m128i xmm_src, xmm_def;
4107 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4109 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4114 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4115 PIXMAN_FORMAT_BPP (dst_image->bits.format),
4116 dest_x, dest_y, width, height, 0);
4120 PIXMAN_IMAGE_GET_LINE (
4121 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4122 PIXMAN_IMAGE_GET_LINE (
4123 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4125 xmm_def = create_mask_2x32_128 (src, src);
4126 xmm_src = expand_pixel_32_1x128 (src);
4131 dst_line += dst_stride;
4133 mask_line += mask_stride;
4136 /* call prefetch hint to optimize cache load*/
4137 cache_prefetch ((__m128i*)mask);
4138 cache_prefetch ((__m128i*)dst);
4140 while (w && (unsigned long)dst & 15)
4142 uint8_t m = *mask++;
4146 *dst = pack_1x64_32 (
4148 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4159 /* call prefetch hint to optimize cache load*/
4160 cache_prefetch ((__m128i*)mask);
4161 cache_prefetch ((__m128i*)dst);
4165 /* fill cache line with next memory */
4166 cache_prefetch_next ((__m128i*)mask);
4167 cache_prefetch_next ((__m128i*)dst);
4169 m = *((uint32_t*)mask);
4171 if (srca == 0xff && m == 0xffffffff)
4173 save_128_aligned ((__m128i*)dst, xmm_def);
4177 xmm_mask = unpack_32_1x128 (m);
4178 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4181 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4183 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4184 &xmm_mask_lo, &xmm_mask_hi);
4186 pix_multiply_2x128 (&xmm_src, &xmm_src,
4187 &xmm_mask_lo, &xmm_mask_hi,
4188 &xmm_mask_lo, &xmm_mask_hi);
4191 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4195 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4205 uint8_t m = *mask++;
4209 *dst = pack_1x64_32 (
4211 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4226 /*-----------------------------------------------------------------------
4227 * composite_over_n_8_0565
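 * A solid source composited OVER an r5g6b5 destination through an a8 mask.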
4231 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4233 pixman_image_t * src_image,
4234 pixman_image_t * mask_image,
4235 pixman_image_t * dst_image,
4246 uint16_t *dst_line, *dst, d;
4247 uint8_t *mask_line, *mask;
4248 int dst_stride, mask_stride;
4251 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4253 __m128i xmm_src, xmm_alpha;
4254 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4255 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4257 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4263 PIXMAN_IMAGE_GET_LINE (
4264 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4265 PIXMAN_IMAGE_GET_LINE (
4266 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4268 xmm_src = expand_pixel_32_1x128 (src);
4269 xmm_alpha = expand_alpha_1x128 (xmm_src);
4270 mmx_src = _mm_movepi64_pi64 (xmm_src);
4271 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4276 dst_line += dst_stride;
4278 mask_line += mask_stride;
4281 /* call prefetch hint to optimize cache load*/
4282 cache_prefetch ((__m128i*)mask);
4283 cache_prefetch ((__m128i*)dst);
4285 while (w && (unsigned long)dst & 15)
4292 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4293 mmx_dest = expand565_16_1x64 (d);
4295 *dst = pack_565_32_16 (
4298 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4305 /* call prefetch hint to optimize cache load*/
4306 cache_prefetch ((__m128i*)mask);
4307 cache_prefetch ((__m128i*)dst);
4311 /* fill cache line with next memory */
4312 cache_prefetch_next ((__m128i*)mask);
4313 cache_prefetch_next ((__m128i*)dst);
4315 xmm_dst = load_128_aligned ((__m128i*) dst);
4316 unpack_565_128_4x128 (xmm_dst,
4317 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4319 m = *((uint32_t*)mask);
4324 xmm_mask = unpack_32_1x128 (m);
4325 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4328 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4330 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4331 &xmm_mask_lo, &xmm_mask_hi);
4333 in_over_2x128 (&xmm_src, &xmm_src,
4334 &xmm_alpha, &xmm_alpha,
4335 &xmm_mask_lo, &xmm_mask_hi,
4336 &xmm_dst0, &xmm_dst1);
4339 m = *((uint32_t*)mask);
4344 xmm_mask = unpack_32_1x128 (m);
4345 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4348 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4350 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4351 &xmm_mask_lo, &xmm_mask_hi);
4352 in_over_2x128 (&xmm_src, &xmm_src,
4353 &xmm_alpha, &xmm_alpha,
4354 &xmm_mask_lo, &xmm_mask_hi,
4355 &xmm_dst2, &xmm_dst3);
4359 (__m128i*)dst, pack_565_4x128_128 (
4360 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4373 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4374 mmx_dest = expand565_16_1x64 (d);
4376 *dst = pack_565_32_16 (
4379 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4390 /* -----------------------------------------------------------------------
4391 * composite_over_pixbuf_0565
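 * A non-premultiplied (pixbuf) source composited OVER an r5g6b5
 * destination; the source and mask share the same drawable.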
4395 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4397 pixman_image_t * src_image,
4398 pixman_image_t * mask_image,
4399 pixman_image_t * dst_image,
4409 uint16_t *dst_line, *dst, d;
4410 uint32_t *src_line, *src, s;
4411 int dst_stride, src_stride;
4413 uint32_t opaque, zero;
4416 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4417 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4419 PIXMAN_IMAGE_GET_LINE (
4420 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4421 PIXMAN_IMAGE_GET_LINE (
4422 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4427 * This code is copied from the MMX version and keeps its FIXME.
4428 * If it's a problem there, it's probably a problem here too.
4430 assert (src_image->drawable == mask_image->drawable);
4436 dst_line += dst_stride;
4438 src_line += src_stride;
4441 /* call prefetch hint to optimize cache load*/
4442 cache_prefetch ((__m128i*)src);
4443 cache_prefetch ((__m128i*)dst);
4445 while (w && (unsigned long)dst & 15)
4450 ms = unpack_32_1x64 (s);
4452 *dst++ = pack_565_32_16 (
4454 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4458 /* call prefetch hint to optimize cache load*/
4459 cache_prefetch ((__m128i*)src);
4460 cache_prefetch ((__m128i*)dst);
4464 /* fill cache line with next memory */
4465 cache_prefetch_next ((__m128i*)src);
4466 cache_prefetch_next ((__m128i*)dst);
4469 xmm_src = load_128_unaligned ((__m128i*)src);
4470 xmm_dst = load_128_aligned ((__m128i*)dst);
4472 opaque = is_opaque (xmm_src);
4473 zero = is_zero (xmm_src);
4475 unpack_565_128_4x128 (xmm_dst,
4476 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4477 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4479 /* preload next round */
4480 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4484 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4485 &xmm_dst0, &xmm_dst1);
4489 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4490 &xmm_dst0, &xmm_dst1);
4494 opaque = is_opaque (xmm_src);
4495 zero = is_zero (xmm_src);
4497 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4501 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4502 &xmm_dst2, &xmm_dst3);
4506 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4507 &xmm_dst2, &xmm_dst3);
4511 (__m128i*)dst, pack_565_4x128_128 (
4512 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4524 ms = unpack_32_1x64 (s);
4526 *dst++ = pack_565_32_16 (
4528 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4536 /* -------------------------------------------------------------------------
4537 * composite_over_pixbuf_8888
4541 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4543 pixman_image_t * src_image,
4544 pixman_image_t * mask_image,
4545 pixman_image_t * dst_image,
4555 uint32_t *dst_line, *dst, d;
4556 uint32_t *src_line, *src, s;
4557 int dst_stride, src_stride;
4559 uint32_t opaque, zero;
4561 __m128i xmm_src_lo, xmm_src_hi;
4562 __m128i xmm_dst_lo, xmm_dst_hi;
4564 PIXMAN_IMAGE_GET_LINE (
4565 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4566 PIXMAN_IMAGE_GET_LINE (
4567 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4572 * This code is copied from the MMX version and keeps its FIXME.
4573 * If it's a problem there, it's probably a problem here too.
4575 assert (src_image->drawable == mask_image->drawable);
4581 dst_line += dst_stride;
4583 src_line += src_stride;
4586 /* call prefetch hint to optimize cache load*/
4587 cache_prefetch ((__m128i*)src);
4588 cache_prefetch ((__m128i*)dst);
4590 while (w && (unsigned long)dst & 15)
4595 *dst++ = pack_1x64_32 (
4596 over_rev_non_pre_1x64 (
4597 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4602 /* call prefetch hint to optimize cache load*/
4603 cache_prefetch ((__m128i*)src);
4604 cache_prefetch ((__m128i*)dst);
4608 /* fill cache line with next memory */
4609 cache_prefetch_next ((__m128i*)src);
4610 cache_prefetch_next ((__m128i*)dst);
4612 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4614 opaque = is_opaque (xmm_src_hi);
4615 zero = is_zero (xmm_src_hi);
4617 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4621 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4622 &xmm_dst_lo, &xmm_dst_hi);
4625 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4629 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4631 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4633 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4634 &xmm_dst_lo, &xmm_dst_hi);
4637 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4650 *dst++ = pack_1x64_32 (
4651 over_rev_non_pre_1x64 (
4652 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4661 /* -------------------------------------------------------------------------------------------------
4662 * composite_over_n_8888_0565_ca
4666 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4668 pixman_image_t * src_image,
4669 pixman_image_t * mask_image,
4670 pixman_image_t * dst_image,
4681 uint16_t *dst_line, *dst, d;
4682 uint32_t *mask_line, *mask, m;
4683 int dst_stride, mask_stride;
4687 __m128i xmm_src, xmm_alpha;
4688 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4689 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4691 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4693 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4698 PIXMAN_IMAGE_GET_LINE (
4699 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4700 PIXMAN_IMAGE_GET_LINE (
4701 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4703 xmm_src = expand_pixel_32_1x128 (src);
4704 xmm_alpha = expand_alpha_1x128 (xmm_src);
4705 mmx_src = _mm_movepi64_pi64 (xmm_src);
4706 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4713 mask_line += mask_stride;
4714 dst_line += dst_stride;
4716 /* call prefetch hint to optimize cache load*/
4717 cache_prefetch ((__m128i*)mask);
4718 cache_prefetch ((__m128i*)dst);
4720 while (w && ((unsigned long)dst & 15))
4722 m = *(uint32_t *) mask;
4727 mmx_mask = unpack_32_1x64 (m);
4728 mmx_dest = expand565_16_1x64 (d);
4730 *dst = pack_565_32_16 (
4733 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4741 /* call prefetch hint to optimize cache load*/
4742 cache_prefetch ((__m128i*)mask);
4743 cache_prefetch ((__m128i*)dst);
4747 /* fill cache line with next memory */
4748 cache_prefetch_next ((__m128i*)mask);
4749 cache_prefetch_next ((__m128i*)dst);
4752 xmm_mask = load_128_unaligned ((__m128i*)mask);
4753 xmm_dst = load_128_aligned ((__m128i*)dst);
4755 pack_cmp = _mm_movemask_epi8 (
4756 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4758 unpack_565_128_4x128 (xmm_dst,
4759 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4760 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4762 /* preload next round */
4763 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4765 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
4766 if (pack_cmp != 0xffff)
4768 in_over_2x128 (&xmm_src, &xmm_src,
4769 &xmm_alpha, &xmm_alpha,
4770 &xmm_mask_lo, &xmm_mask_hi,
4771 &xmm_dst0, &xmm_dst1);
4775 pack_cmp = _mm_movemask_epi8 (
4776 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4778 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4780 if (pack_cmp != 0xffff)
4782 in_over_2x128 (&xmm_src, &xmm_src,
4783 &xmm_alpha, &xmm_alpha,
4784 &xmm_mask_lo, &xmm_mask_hi,
4785 &xmm_dst2, &xmm_dst3);
4789 (__m128i*)dst, pack_565_4x128_128 (
4790 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4799 m = *(uint32_t *) mask;
4804 mmx_mask = unpack_32_1x64 (m);
4805 mmx_dest = expand565_16_1x64 (d);
4807 *dst = pack_565_32_16 (
4810 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4822 /* -----------------------------------------------------------------------
4823 * composite_in_n_8_8
4827 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4829 pixman_image_t * src_image,
4830 pixman_image_t * mask_image,
4831 pixman_image_t * dst_image,
4841 uint8_t *dst_line, *dst;
4842 uint8_t *mask_line, *mask;
4843 int dst_stride, mask_stride;
4849 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4850 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4852 PIXMAN_IMAGE_GET_LINE (
4853 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4854 PIXMAN_IMAGE_GET_LINE (
4855 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4857 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4861 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4866 dst_line += dst_stride;
4868 mask_line += mask_stride;
4871 /* call prefetch hint to optimize cache load*/
4872 cache_prefetch ((__m128i*)mask);
4873 cache_prefetch ((__m128i*)dst);
4875 while (w && ((unsigned long)dst & 15))
4877 m = (uint32_t) *mask++;
4878 d = (uint32_t) *dst;
4880 *dst++ = (uint8_t) pack_1x64_32 (
4882 pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4883 unpack_32_1x64 (m)),
4884 unpack_32_1x64 (d)));
4888 /* call prefetch hint to optimize cache load*/
4889 cache_prefetch ((__m128i*)mask);
4890 cache_prefetch ((__m128i*)dst);
4894 /* fill cache line with next memory */
4895 cache_prefetch_next ((__m128i*)mask);
4896 cache_prefetch_next ((__m128i*)dst);
4898 xmm_mask = load_128_unaligned ((__m128i*)mask);
4899 xmm_dst = load_128_aligned ((__m128i*)dst);
4901 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4902 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4904 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4905 &xmm_mask_lo, &xmm_mask_hi,
4906 &xmm_mask_lo, &xmm_mask_hi);
4908 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4909 &xmm_dst_lo, &xmm_dst_hi,
4910 &xmm_dst_lo, &xmm_dst_hi);
4913 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4922 m = (uint32_t) *mask++;
4923 d = (uint32_t) *dst;
4925 *dst++ = (uint8_t) pack_1x64_32 (
4928 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4929 unpack_32_1x64 (d)));
4937 /* ---------------------------------------------------------------------------
4942 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4944 pixman_image_t * src_image,
4945 pixman_image_t * mask_image,
4946 pixman_image_t * dst_image,
4956 uint8_t *dst_line, *dst;
4957 uint8_t *src_line, *src;
4958 int src_stride, dst_stride;
4962 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4963 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4965 PIXMAN_IMAGE_GET_LINE (
4966 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4967 PIXMAN_IMAGE_GET_LINE (
4968 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4973 dst_line += dst_stride;
4975 src_line += src_stride;
4978 /* call prefetch hint to optimize cache load*/
4979 cache_prefetch ((__m128i*)src);
4980 cache_prefetch ((__m128i*)dst);
4982 while (w && ((unsigned long)dst & 15))
4984 s = (uint32_t) *src++;
4985 d = (uint32_t) *dst;
4987 *dst++ = (uint8_t) pack_1x64_32 (
4989 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4993 /* call prefetch hint to optimize cache load*/
4994 cache_prefetch ((__m128i*)src);
4995 cache_prefetch ((__m128i*)dst);
4999 /* fill cache line with next memory */
5000 cache_prefetch_next ((__m128i*)src);
5001 cache_prefetch_next ((__m128i*)dst);
5003 xmm_src = load_128_unaligned ((__m128i*)src);
5004 xmm_dst = load_128_aligned ((__m128i*)dst);
5006 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5007 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5009 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5010 &xmm_dst_lo, &xmm_dst_hi,
5011 &xmm_dst_lo, &xmm_dst_hi);
5014 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5023 s = (uint32_t) *src++;
5024 d = (uint32_t) *dst;
5026 *dst++ = (uint8_t) pack_1x64_32 (
5027 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5035 /* -------------------------------------------------------------------------
5036 * composite_add_8888_8_8
5040 sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
5042 pixman_image_t * src_image,
5043 pixman_image_t * mask_image,
5044 pixman_image_t * dst_image,
5054 uint8_t *dst_line, *dst;
5055 uint8_t *mask_line, *mask;
5056 int dst_stride, mask_stride;
5063 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5064 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5066 PIXMAN_IMAGE_GET_LINE (
5067 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5068 PIXMAN_IMAGE_GET_LINE (
5069 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5071 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5075 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5080 dst_line += dst_stride;
5082 mask_line += mask_stride;
5085 /* call prefetch hint to optimize cache load*/
5086 cache_prefetch ((__m128i*)mask);
5087 cache_prefetch ((__m128i*)dst);
5089 while (w && ((unsigned long)dst & 15))
5091 m = (uint32_t) *mask++;
5092 d = (uint32_t) *dst;
5094 *dst++ = (uint8_t) pack_1x64_32 (
5097 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5098 unpack_32_1x64 (d)));
5102 /* call prefetch hint to optimize cache load*/
5103 cache_prefetch ((__m128i*)mask);
5104 cache_prefetch ((__m128i*)dst);
5108 /* fill cache line with next memory */
5109 cache_prefetch_next ((__m128i*)mask);
5110 cache_prefetch_next ((__m128i*)dst);
5112 xmm_mask = load_128_unaligned ((__m128i*)mask);
5113 xmm_dst = load_128_aligned ((__m128i*)dst);
5115 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5116 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5118 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5119 &xmm_mask_lo, &xmm_mask_hi,
5120 &xmm_mask_lo, &xmm_mask_hi);
5122 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5123 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5126 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5135 m = (uint32_t) *mask++;
5136 d = (uint32_t) *dst;
5138 *dst++ = (uint8_t) pack_1x64_32 (
5141 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5142 unpack_32_1x64 (d)));
5151 /* ----------------------------------------------------------------------
5152 * composite_add_8000_8000
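 * Saturating ADD of an a8 source into an a8 destination.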
5156 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5158 pixman_image_t * src_image,
5159 pixman_image_t * mask_image,
5160 pixman_image_t * dst_image,
5170 uint8_t *dst_line, *dst;
5171 uint8_t *src_line, *src;
5172 int dst_stride, src_stride;
5176 PIXMAN_IMAGE_GET_LINE (
5177 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5178 PIXMAN_IMAGE_GET_LINE (
5179 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5186 /* call prefetch hint to optimize cache load*/
5187 cache_prefetch ((__m128i*)src);
5188 cache_prefetch ((__m128i*)dst);
5190 dst_line += dst_stride;
5191 src_line += src_stride;
5195 while (w && (unsigned long)dst & 3)
5197 t = (*dst) + (*src++);
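/* Saturating add: if the 8-bit sum overflows, (t >> 8) is 1, so
 * 0 - (t >> 8) is all ones and the OR clamps the stored value to 0xff */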
5198 *dst++ = t | (0 - (t >> 8));
5202 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5212 t = (*dst) + (*src++);
5213 *dst++ = t | (0 - (t >> 8));
5221 /* ---------------------------------------------------------------------
5222 * composite_add_8888_8888
5225 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5227 pixman_image_t * src_image,
5228 pixman_image_t * mask_image,
5229 pixman_image_t * dst_image,
5239 uint32_t *dst_line, *dst;
5240 uint32_t *src_line, *src;
5241 int dst_stride, src_stride;
5243 PIXMAN_IMAGE_GET_LINE (
5244 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5245 PIXMAN_IMAGE_GET_LINE (
5246 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5251 dst_line += dst_stride;
5253 src_line += src_stride;
5255 core_combine_add_u_sse2 (dst, src, NULL, width);
5261 /* -------------------------------------------------------------------------------------------------
5262 * sse2_composite_copy_area
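 * A plain pixel copy implemented on top of pixman_blt_sse2() below.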
5265 static pixman_bool_t
5266 pixman_blt_sse2 (uint32_t *src_bits,
5279 uint8_t * src_bytes;
5280 uint8_t * dst_bytes;
5283 if (src_bpp != dst_bpp)
5288 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5289 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5290 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5291 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5292 byte_width = 2 * width;
5296 else if (src_bpp == 32)
5298 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5299 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5300 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5301 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5302 byte_width = 4 * width;
5311 cache_prefetch ((__m128i*)src_bytes);
5312 cache_prefetch ((__m128i*)dst_bytes);
5317 uint8_t *s = src_bytes;
5318 uint8_t *d = dst_bytes;
5319 src_bytes += src_stride;
5320 dst_bytes += dst_stride;
5323 cache_prefetch_next ((__m128i*)s);
5324 cache_prefetch_next ((__m128i*)d);
5326 while (w >= 2 && ((unsigned long)d & 3))
5328 *(uint16_t *)d = *(uint16_t *)s;
5334 while (w >= 4 && ((unsigned long)d & 15))
5336 *(uint32_t *)d = *(uint32_t *)s;
5343 cache_prefetch_next ((__m128i*)s);
5344 cache_prefetch_next ((__m128i*)d);
5348 __m128i xmm0, xmm1, xmm2, xmm3;
5350 /* 128 bytes ahead */
5351 cache_prefetch (((__m128i*)s) + 8);
5352 cache_prefetch (((__m128i*)d) + 8);
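/* Copy 64 bytes per iteration: four unaligned loads from the source
 * and four aligned stores to the destination */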
5354 xmm0 = load_128_unaligned ((__m128i*)(s));
5355 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5356 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5357 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5359 save_128_aligned ((__m128i*)(d), xmm0);
5360 save_128_aligned ((__m128i*)(d + 16), xmm1);
5361 save_128_aligned ((__m128i*)(d + 32), xmm2);
5362 save_128_aligned ((__m128i*)(d + 48), xmm3);
5369 cache_prefetch_next ((__m128i*)s);
5370 cache_prefetch_next ((__m128i*)d);
5374 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5381 cache_prefetch_next ((__m128i*)s);
5382 cache_prefetch_next ((__m128i*)d);
5386 *(uint32_t *)d = *(uint32_t *)s;
5395 *(uint16_t *)d = *(uint16_t *)s;
5408 sse2_composite_copy_area (pixman_implementation_t *imp,
5410 pixman_image_t * src_image,
5411 pixman_image_t * mask_image,
5412 pixman_image_t * dst_image,
5422 pixman_blt_sse2 (src_image->bits.bits,
5423 dst_image->bits.bits,
5424 src_image->bits.rowstride,
5425 dst_image->bits.rowstride,
5426 PIXMAN_FORMAT_BPP (src_image->bits.format),
5427 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5428 src_x, src_y, dest_x, dest_y, width, height);
5432 /* This code is buggy in the MMX version; the bug has been carried over to this SSE2 version */
5434 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5436 pixman_image_t * src_image,
5437 pixman_image_t * mask_image,
5438 pixman_image_t * dst_image,
5448 uint32_t *src, *src_line, s;
5449 uint32_t *dst, *dst_line, d;
5450 uint8_t *mask, *mask_line;
5452 int src_stride, mask_stride, dst_stride;
5455 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5456 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5457 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5459 PIXMAN_IMAGE_GET_LINE (
5460 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5461 PIXMAN_IMAGE_GET_LINE (
5462 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5463 PIXMAN_IMAGE_GET_LINE (
5464 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5469 src_line += src_stride;
5471 dst_line += dst_stride;
5473 mask_line += mask_stride;
5477 /* call prefetch hint to optimize cache load*/
5478 cache_prefetch ((__m128i*)src);
5479 cache_prefetch ((__m128i*)dst);
5480 cache_prefetch ((__m128i*)mask);
5482 while (w && (unsigned long)dst & 15)
5484 s = 0xff000000 | *src++;
5485 m = (uint32_t) *mask++;
5488 __m64 ms = unpack_32_1x64 (s);
5492 ms = in_over_1x64 (ms,
5494 expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
5495 unpack_32_1x64 (d));
5498 *dst++ = pack_1x64_32 (ms);
5502 /* call prefetch hint to optimize cache load*/
5503 cache_prefetch ((__m128i*)src);
5504 cache_prefetch ((__m128i*)dst);
5505 cache_prefetch ((__m128i*)mask);
5509 /* fill cache line with next memory */
5510 cache_prefetch_next ((__m128i*)src);
5511 cache_prefetch_next ((__m128i*)dst);
5512 cache_prefetch_next ((__m128i*)mask);
5514 m = *(uint32_t*) mask;
5515 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5517 if (m == 0xffffffff)
5519 save_128_aligned ((__m128i*)dst, xmm_src);
5523 xmm_dst = load_128_aligned ((__m128i*)dst);
5525 xmm_mask = _mm_unpacklo_epi16 (
5526 unpack_32_1x128 (m), _mm_setzero_si128 ());
5528 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5529 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5530 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5532 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
5533 &xmm_mask_lo, &xmm_mask_hi);
5535 in_over_2x128 (xmm_src_lo, xmm_src_hi,
5536 mask_00ff, mask_00ff,
5537 xmm_mask_lo, xmm_mask_hi,
5538 &xmm_dst_lo, &xmm_dst_hi);
5541 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5552 m = (uint32_t) *mask++;
5556 s = 0xff000000 | *src;
5566 *dst = pack_1x64_32 (
5570 expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
5571 unpack_32_1x64 (d)));
5587 static const pixman_fast_path_t sse2_fast_paths[] =
5589 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 },
5590 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 },
5591 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 },
5592 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 },
5593 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 },
5594 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
5595 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
5596 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
5597 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
5598 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
5599 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
5600 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5601 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5602 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5603 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5605 /* FIXME: This code is buggy in the MMX version; the bug has been carried over to this SSE2 version */
5606 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5607 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5608 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
5609 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
5611 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5612 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5613 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5614 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5615 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5616 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5617 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5618 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5619 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5620 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5621 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5622 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5623 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5624 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5625 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5626 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5627 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5628 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5629 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5630 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5631 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5632 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5633 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5634 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5635 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5636 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5637 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5638 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5640 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5641 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 },
5642 { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 },
5643 { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 },
5644 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_8888_8_8, 0 },
5646 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5647 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5648 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5649 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5650 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 },
5651 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 },
5652 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5653 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5654 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5655 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5656 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 },
5657 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 },
5659 { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 },
5660 { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 },
5666 * Work around GCC bug causing crashes in Mozilla with SSE2
5668 * When using -msse, gcc generates movdqa instructions assuming that
5669 * the stack is 16 byte aligned. Unfortunately some applications, such
5670 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
5671 * causes the movdqa instructions to fail.
5673 * The __force_align_arg_pointer__ makes gcc generate a prologue that
5674 * realigns the stack pointer to 16 bytes.
5676 * On x86-64 this is not necessary because the standard ABI already
5677 * calls for a 16 byte aligned stack.
5679 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
5681 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5682 __attribute__((__force_align_arg_pointer__))
5685 sse2_composite (pixman_implementation_t *imp,
5687 pixman_image_t * src,
5688 pixman_image_t * mask,
5689 pixman_image_t * dest,
5699 if (_pixman_run_fast_path (sse2_fast_paths, imp,
5700 op, src, mask, dest,
5709 _pixman_implementation_composite (imp->delegate, op,
5717 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5718 __attribute__((__force_align_arg_pointer__))
5720 static pixman_bool_t
5721 sse2_blt (pixman_implementation_t *imp,
5722 uint32_t * src_bits,
5723 uint32_t * dst_bits,
5735 if (!pixman_blt_sse2 (
5736 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5737 src_x, src_y, dst_x, dst_y, width, height))
5740 return _pixman_implementation_blt (
5742 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5743 src_x, src_y, dst_x, dst_y, width, height);
5749 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5750 __attribute__((__force_align_arg_pointer__))
5752 static pixman_bool_t
5753 sse2_fill (pixman_implementation_t *imp,
5763 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5765 return _pixman_implementation_fill (
5766 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5772 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5773 __attribute__((__force_align_arg_pointer__))
5775 pixman_implementation_t *
5776 _pixman_implementation_create_sse2 (void)
5778 pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5779 pixman_implementation_t *imp = _pixman_implementation_create (mmx);
5781 /* SSE2 constants */
5782 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5783 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5784 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5785 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5786 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5787 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5788 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5789 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5790 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
5791 mask_0080 = create_mask_16_128 (0x0080);
5792 mask_00ff = create_mask_16_128 (0x00ff);
5793 mask_0101 = create_mask_16_128 (0x0101);
5794 mask_ffff = create_mask_16_128 (0xffff);
5795 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5796 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5799 mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5800 mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5802 mask_x0080 = create_mask_16_64 (0x0080);
5803 mask_x00ff = create_mask_16_64 (0x00ff);
5804 mask_x0101 = create_mask_16_64 (0x0101);
5805 mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5809 /* Set up function pointers */
5811 /* SSE2 code patch for fbcompose.c */
5812 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5813 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5814 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5815 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5816 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5817 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5818 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5819 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5820 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5821 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5823 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5825 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5826 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5827 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5828 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5829 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5830 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5831 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5832 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5833 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5834 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5835 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5837 imp->composite = sse2_composite;
5838 imp->blt = sse2_blt;
5839 imp->fill = sse2_fill;
5844 #endif /* USE_SSE2 */