2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
41 /* --------------------------------------------------------------------
45 static __m64 mask_x0080;
46 static __m64 mask_x00ff;
47 static __m64 mask_x0101;
48 static __m64 mask_x_alpha;
50 static __m64 mask_x565_rgb;
51 static __m64 mask_x565_unpack;
53 static __m128i mask_0080;
54 static __m128i mask_00ff;
55 static __m128i mask_0101;
56 static __m128i mask_ffff;
57 static __m128i mask_ff000000;
58 static __m128i mask_alpha;
60 static __m128i mask_565_r;
61 static __m128i mask_565_g1, mask_565_g2;
62 static __m128i mask_565_b;
63 static __m128i mask_red;
64 static __m128i mask_green;
65 static __m128i mask_blue;
67 static __m128i mask_565_fix_rb;
68 static __m128i mask_565_fix_g;
70 /* ----------------------------------------------------------------------
73 static force_inline __m128i
74 unpack_32_1x128 (uint32_t data)
76 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
79 static force_inline void
80 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
82 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
83 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
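/* Convert 4 r5g6b5 values (already zero-extended to 32 bits each) to
 * x8r8g8b8: shift each field into its 8-bit slot, then OR in a copy of
 * its top bits so that full intensity (0x1f / 0x3f) expands to 0xff.
 */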
86 static force_inline __m128i
87 unpack_565_to_8888 (__m128i lo)
89 __m128i r, g, b, rb, t;
91 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
92 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
93 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
95 rb = _mm_or_si128 (r, b);
96 t = _mm_and_si128 (rb, mask_565_fix_rb);
97 t = _mm_srli_epi32 (t, 5);
98 rb = _mm_or_si128 (rb, t);
100 t = _mm_and_si128 (g, mask_565_fix_g);
101 t = _mm_srli_epi32 (t, 6);
102 g = _mm_or_si128 (g, t);
104 return _mm_or_si128 (rb, g);
107 static force_inline void
108 unpack_565_128_4x128 (__m128i data,
116 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
117 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
119 lo = unpack_565_to_8888 (lo);
120 hi = unpack_565_to_8888 (hi);
122 unpack_128_2x128 (lo, data0, data1);
123 unpack_128_2x128 (hi, data2, data3);
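/* Convert one x8r8g8b8 pixel to r5g6b5 by keeping only the top
 * 5/6/5 bits of the red/green/blue channels.
 */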
126 static force_inline uint16_t
127 pack_565_32_16 (uint32_t pixel)
129 return (uint16_t) (((pixel >> 8) & 0xf800) |
130 ((pixel >> 5) & 0x07e0) |
131 ((pixel >> 3) & 0x001f));
134 static force_inline __m128i
135 pack_2x128_128 (__m128i lo, __m128i hi)
137 return _mm_packus_epi16 (lo, hi);
140 static force_inline __m128i
141 pack_565_2x128_128 (__m128i lo, __m128i hi)
144 __m128i r, g1, g2, b;
146 data = pack_2x128_128 (lo, hi);
148 r = _mm_and_si128 (data, mask_565_r);
149 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
150 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
151 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
153 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
156 static force_inline __m128i
157 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
159 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
160 pack_565_2x128_128 (*xmm2, *xmm3));
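/* The 0x8888 masks below pick out the movemask bits produced by the
 * alpha byte (the high byte of each 32-bit pixel): is_opaque tests
 * that all four alphas are 0xff, is_zero that the whole vector is
 * zero, is_transparent that all four alphas are zero.
 */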
163 static force_inline int
164 is_opaque (__m128i x)
166 __m128i ffs = _mm_cmpeq_epi8 (x, x);
168 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
171 static force_inline int
174 return _mm_movemask_epi8 (
175 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
178 static force_inline int
179 is_transparent (__m128i x)
181 return (_mm_movemask_epi8 (
182 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
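/* Unpack one 32-bit pixel to 16 bits per channel and duplicate it
 * into both 64-bit halves of the register.
 */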
185 static force_inline __m128i
186 expand_pixel_32_1x128 (uint32_t data)
188 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
191 static force_inline __m128i
192 expand_alpha_1x128 (__m128i data)
194 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
195 _MM_SHUFFLE (3, 3, 3, 3)),
196 _MM_SHUFFLE (3, 3, 3, 3));
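/* Broadcast the alpha component (word 3 of each unpacked pixel, or
 * word 0 for the _rev variants) across all four channel words.
 */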
199 static force_inline void
200 expand_alpha_2x128 (__m128i data_lo,
207 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
208 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
210 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
211 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
214 static force_inline void
215 expand_alpha_rev_2x128 (__m128i data_lo,
222 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
223 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
224 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
225 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
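/* Per-channel multiply of two unpacked vectors with division by 255:
 * (x * a + 0x80) * 0x0101 >> 16 gives the correctly rounded x * a / 255.
 */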
228 static force_inline void
229 pix_multiply_2x128 (__m128i* data_lo,
238 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
239 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
240 lo = _mm_adds_epu16 (lo, mask_0080);
241 hi = _mm_adds_epu16 (hi, mask_0080);
242 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
243 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
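/* Per channel: (src * alpha_dst + dst * alpha_src) / 255, using
 * saturating adds for the intermediate sum and the same 0x80 / 0x0101
 * rounding as pix_multiply_2x128.
 */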
246 static force_inline void
247 pix_add_multiply_2x128 (__m128i* src_lo,
249 __m128i* alpha_dst_lo,
250 __m128i* alpha_dst_hi,
253 __m128i* alpha_src_lo,
254 __m128i* alpha_src_hi,
259 __m128i mul_lo, mul_hi;
261 lo = _mm_mullo_epi16 (*src_lo, *alpha_dst_lo);
262 hi = _mm_mullo_epi16 (*src_hi, *alpha_dst_hi);
263 mul_lo = _mm_mullo_epi16 (*dst_lo, *alpha_src_lo);
264 mul_hi = _mm_mullo_epi16 (*dst_hi, *alpha_src_hi);
265 lo = _mm_adds_epu16 (lo, mask_0080);
266 hi = _mm_adds_epu16 (hi, mask_0080);
267 lo = _mm_adds_epu16 (lo, mul_lo);
268 hi = _mm_adds_epu16 (hi, mul_hi);
269 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
270 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
273 static force_inline void
274 negate_2x128 (__m128i data_lo,
279 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
280 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
283 static force_inline void
284 invert_colors_2x128 (__m128i data_lo,
291 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
292 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
293 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
294 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
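/* Porter-Duff OVER for premultiplied pixels:
 * dst = src + dst * (255 - alpha) / 255, with a saturating add.
 */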
297 static force_inline void
298 over_2x128 (__m128i* src_lo,
307 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
309 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
311 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
312 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
315 static force_inline void
316 over_rev_non_pre_2x128 (__m128i src_lo,
322 __m128i alpha_lo, alpha_hi;
324 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
326 lo = _mm_or_si128 (alpha_lo, mask_alpha);
327 hi = _mm_or_si128 (alpha_hi, mask_alpha);
329 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
331 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
333 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
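/* (src IN mask) OVER dst: scale both the source and its alpha by the
 * mask, then apply OVER.
 */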
336 static force_inline void
337 in_over_2x128 (__m128i* src_lo,
349 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
350 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
352 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
355 static force_inline void
356 cache_prefetch (__m128i* addr)
358 _mm_prefetch (addr, _MM_HINT_T0);
361 static force_inline void
362 cache_prefetch_next (__m128i* addr)
364 _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes (4 x __m128i) ahead */
367 /* load 4 pixels from a 16-byte aligned address */
368 static force_inline __m128i
369 load_128_aligned (__m128i* src)
371 return _mm_load_si128 (src);
374 /* load 4 pixels from an unaligned address */
375 static force_inline __m128i
376 load_128_unaligned (const __m128i* src)
378 return _mm_loadu_si128 (src);
381 /* save 4 pixels to a 16-byte aligned address using a
382 * non-temporal (write-combining) store
384 static force_inline void
385 save_128_write_combining (__m128i* dst,
388 _mm_stream_si128 (dst, data);
391 /* save 4 pixels to a 16-byte aligned address */
392 static force_inline void
393 save_128_aligned (__m128i* dst,
396 _mm_store_si128 (dst, data);
399 /* save 4 pixels to an unaligned address */
400 static force_inline void
401 save_128_unaligned (__m128i* dst,
404 _mm_storeu_si128 (dst, data);
407 /* ------------------------------------------------------------------
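 * Single-pixel (__m64) versions of the SSE2 helpers above; they are
 * used for the unaligned leading and trailing pixels of each run.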
411 static force_inline __m64
412 unpack_32_1x64 (uint32_t data)
414 return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
417 static force_inline __m64
418 expand_alpha_1x64 (__m64 data)
420 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
423 static force_inline __m64
424 expand_alpha_rev_1x64 (__m64 data)
426 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
429 static force_inline __m64
430 expand_pixel_8_1x64 (uint8_t data)
432 return _mm_shuffle_pi16 (
433 unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
436 static force_inline __m64
437 pix_multiply_1x64 (__m64 data,
440 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
445 static force_inline __m64
446 pix_add_multiply_1x64 (__m64* src,
451 return _mm_mulhi_pu16 (
452 _mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst),
454 _mm_mullo_pi16 (*dst, *alpha_src)),
458 static force_inline __m64
459 negate_1x64 (__m64 data)
461 return _mm_xor_si64 (data, mask_x00ff);
464 static force_inline __m64
465 invert_colors_1x64 (__m64 data)
467 return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
470 static force_inline __m64
471 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
473 return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
476 static force_inline __m64
477 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
479 return over_1x64 (pix_multiply_1x64 (*src, *mask),
480 pix_multiply_1x64 (*alpha, *mask),
484 static force_inline __m64
485 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
487 __m64 alpha = expand_alpha_1x64 (src);
489 return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
490 _mm_or_si64 (alpha, mask_x_alpha)),
495 static force_inline uint32_t
496 pack_1x64_32 (__m64 data)
498 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
501 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into
505 * --- Expanding 565 in the low word ---
507 * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
508 * m = m & 0x01f0003f001f;
509 * m = m * 0x008404100840;
512 * Note the trick here: the top word is shifted up by an extra nibble
513 * so that it does not bump into the middle word
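 * For example, in the blue field the multiply by 0x0840 (1 << 11 | 1 << 6)
 * followed by the shift right by 8 produces (b << 3) | (b >> 2), the
 * usual 5-bit to 8-bit replication; the green and red multipliers do
 * the same for their fields.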
515 static force_inline __m64
516 expand565_16_1x64 (uint16_t pixel)
521 p = _mm_cvtsi32_si64 ((uint32_t) pixel);
523 t1 = _mm_slli_si64 (p, 36 - 11);
524 t2 = _mm_slli_si64 (p, 16 - 5);
526 p = _mm_or_si64 (t1, p);
527 p = _mm_or_si64 (t2, p);
528 p = _mm_and_si64 (p, mask_x565_rgb);
529 p = _mm_mullo_pi16 (p, mask_x565_unpack);
531 return _mm_srli_pi16 (p, 8);
534 /* ----------------------------------------------------------------------------
535 * Compose Core transformations
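 *
 * OVER for one unpacked pixel: dst = src + (1 - src.alpha) * dst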
537 static force_inline uint32_t
538 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
551 ms = unpack_32_1x64 (src);
552 return pack_1x64_32 (
553 over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
559 static force_inline uint32_t
560 combine1 (const uint32_t *ps, const uint32_t *pm)
568 mm = unpack_32_1x64 (*pm);
569 mm = expand_alpha_1x64 (mm);
571 ms = unpack_32_1x64 (s);
572 ms = pix_multiply_1x64 (ms, mm);
574 s = pack_1x64_32 (ms);
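/* Load 4 source pixels and scale them by the alpha of the
 * corresponding mask pixels (when a mask is supplied); returns zero
 * early when the mask is fully transparent.
 */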
580 static force_inline __m128i
581 combine4 (const __m128i *ps, const __m128i *pm)
583 __m128i xmm_src_lo, xmm_src_hi;
584 __m128i xmm_msk_lo, xmm_msk_hi;
589 xmm_msk_lo = load_128_unaligned (pm);
591 if (is_transparent (xmm_msk_lo))
592 return _mm_setzero_si128 ();
595 s = load_128_unaligned (ps);
599 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
600 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
602 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
604 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
605 &xmm_msk_lo, &xmm_msk_hi,
606 &xmm_src_lo, &xmm_src_hi);
608 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
614 static force_inline void
615 core_combine_over_u_sse2 (uint32_t* pd,
622 __m128i xmm_dst_lo, xmm_dst_hi;
623 __m128i xmm_src_lo, xmm_src_hi;
624 __m128i xmm_alpha_lo, xmm_alpha_hi;
626 /* call prefetch hint to optimize cache load*/
627 cache_prefetch ((__m128i*)ps);
628 cache_prefetch ((__m128i*)pd);
629 cache_prefetch ((__m128i*)pm);
631 /* Align dst on a 16-byte boundary */
633 ((unsigned long)pd & 15))
636 s = combine1 (ps, pm);
638 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
645 /* call prefetch hint to optimize cache load*/
646 cache_prefetch ((__m128i*)ps);
647 cache_prefetch ((__m128i*)pd);
648 cache_prefetch ((__m128i*)pm);
652 /* fill cache line with next memory */
653 cache_prefetch_next ((__m128i*)ps);
654 cache_prefetch_next ((__m128i*)pd);
655 cache_prefetch_next ((__m128i*)pm);
657 * Load unaligned because the source and mask addresses may not
658 * be 16-byte aligned.
660 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
662 if (is_opaque (xmm_src_hi))
664 save_128_aligned ((__m128i*)pd, xmm_src_hi);
666 else if (!is_zero (xmm_src_hi))
668 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
670 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
671 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
674 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
676 over_2x128 (&xmm_src_lo, &xmm_src_hi,
677 &xmm_alpha_lo, &xmm_alpha_hi,
678 &xmm_dst_lo, &xmm_dst_hi);
680 /* rebuild the 4 pixels and save */
681 save_128_aligned ((__m128i*)pd,
682 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
695 s = combine1 (ps, pm);
697 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
706 static force_inline void
707 core_combine_over_reverse_u_sse2 (uint32_t* pd,
714 __m128i xmm_dst_lo, xmm_dst_hi;
715 __m128i xmm_src_lo, xmm_src_hi;
716 __m128i xmm_alpha_lo, xmm_alpha_hi;
718 /* call prefetch hint to optimize cache load*/
719 cache_prefetch ((__m128i*)ps);
720 cache_prefetch ((__m128i*)pd);
721 cache_prefetch ((__m128i*)pm);
723 /* Align dst on a 16-byte boundary */
725 ((unsigned long)pd & 15))
728 s = combine1 (ps, pm);
730 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
737 /* call prefetch hint to optimize cache load*/
738 cache_prefetch ((__m128i*)ps);
739 cache_prefetch ((__m128i*)pd);
740 cache_prefetch ((__m128i*)pm);
744 /* fill cache line with next memory */
745 cache_prefetch_next ((__m128i*)ps);
746 cache_prefetch_next ((__m128i*)pd);
747 cache_prefetch_next ((__m128i*)pm);
749 * Load unaligned because the source and mask addresses
750 * may not be 16-byte aligned.
752 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
753 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
755 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
756 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
758 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
759 &xmm_alpha_lo, &xmm_alpha_hi);
761 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
762 &xmm_alpha_lo, &xmm_alpha_hi,
763 &xmm_src_lo, &xmm_src_hi);
765 /* rebuild the 4 pixels and save */
766 save_128_aligned ((__m128i*)pd,
767 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
780 s = combine1 (ps, pm);
782 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
790 static force_inline uint32_t
791 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
793 uint32_t maska = src >> 24;
799 else if (maska != 0xff)
801 return pack_1x64_32 (
802 pix_multiply_1x64 (unpack_32_1x64 (dst),
803 expand_alpha_1x64 (unpack_32_1x64 (src))));
809 static force_inline void
810 core_combine_in_u_sse2 (uint32_t* pd,
817 __m128i xmm_src_lo, xmm_src_hi;
818 __m128i xmm_dst_lo, xmm_dst_hi;
820 /* call prefetch hint to optimize cache load*/
821 cache_prefetch ((__m128i*)ps);
822 cache_prefetch ((__m128i*)pd);
823 cache_prefetch ((__m128i*)pm);
825 while (w && ((unsigned long) pd & 15))
827 s = combine1 (ps, pm);
830 *pd++ = core_combine_in_u_pixelsse2 (d, s);
837 /* call prefetch hint to optimize cache load*/
838 cache_prefetch ((__m128i*)ps);
839 cache_prefetch ((__m128i*)pd);
840 cache_prefetch ((__m128i*)pm);
844 /* fill cache line with next memory */
845 cache_prefetch_next ((__m128i*)ps);
846 cache_prefetch_next ((__m128i*)pd);
847 cache_prefetch_next ((__m128i*)pm);
849 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
850 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
852 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
853 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
855 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
856 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
857 &xmm_dst_lo, &xmm_dst_hi,
858 &xmm_dst_lo, &xmm_dst_hi);
860 save_128_aligned ((__m128i*)pd,
861 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
872 s = combine1 (ps, pm);
875 *pd++ = core_combine_in_u_pixelsse2 (d, s);
883 static force_inline void
884 core_combine_reverse_in_u_sse2 (uint32_t* pd,
891 __m128i xmm_src_lo, xmm_src_hi;
892 __m128i xmm_dst_lo, xmm_dst_hi;
894 /* call prefetch hint to optimize cache load*/
895 cache_prefetch ((__m128i*)ps);
896 cache_prefetch ((__m128i*)pd);
897 cache_prefetch ((__m128i*)pm);
899 while (w && ((unsigned long) pd & 15))
901 s = combine1 (ps, pm);
904 *pd++ = core_combine_in_u_pixelsse2 (s, d);
911 /* call prefetch hint to optimize cache load*/
912 cache_prefetch ((__m128i*)ps);
913 cache_prefetch ((__m128i*)pd);
914 cache_prefetch ((__m128i*)pm);
918 /* fill cache line with next memory */
919 cache_prefetch_next ((__m128i*)ps);
920 cache_prefetch_next ((__m128i*)pd);
921 cache_prefetch_next ((__m128i*)pm);
923 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
924 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
926 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
927 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
929 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
930 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
931 &xmm_src_lo, &xmm_src_hi,
932 &xmm_dst_lo, &xmm_dst_hi);
935 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
946 s = combine1 (ps, pm);
949 *pd++ = core_combine_in_u_pixelsse2 (s, d);
957 static force_inline void
958 core_combine_reverse_out_u_sse2 (uint32_t* pd,
963 /* call prefetch hint to optimize cache load*/
964 cache_prefetch ((__m128i*)ps);
965 cache_prefetch ((__m128i*)pd);
966 cache_prefetch ((__m128i*)pm);
968 while (w && ((unsigned long) pd & 15))
970 uint32_t s = combine1 (ps, pm);
973 *pd++ = pack_1x64_32 (
975 unpack_32_1x64 (d), negate_1x64 (
976 expand_alpha_1x64 (unpack_32_1x64 (s)))));
984 /* call prefetch hint to optimize cache load*/
985 cache_prefetch ((__m128i*)ps);
986 cache_prefetch ((__m128i*)pd);
987 cache_prefetch ((__m128i*)pm);
991 __m128i xmm_src_lo, xmm_src_hi;
992 __m128i xmm_dst_lo, xmm_dst_hi;
994 /* fill cache line with next memory */
995 cache_prefetch_next ((__m128i*)ps);
996 cache_prefetch_next ((__m128i*)pd);
997 cache_prefetch_next ((__m128i*)pm);
999 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1000 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1002 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1003 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1005 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1006 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1008 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1009 &xmm_src_lo, &xmm_src_hi,
1010 &xmm_dst_lo, &xmm_dst_hi);
1013 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1025 uint32_t s = combine1 (ps, pm);
1028 *pd++ = pack_1x64_32 (
1030 unpack_32_1x64 (d), negate_1x64 (
1031 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1039 static force_inline void
1040 core_combine_out_u_sse2 (uint32_t* pd,
1045 /* call prefetch hint to optimize cache load*/
1046 cache_prefetch ((__m128i*)ps);
1047 cache_prefetch ((__m128i*)pd);
1048 cache_prefetch ((__m128i*)pm);
1050 while (w && ((unsigned long) pd & 15))
1052 uint32_t s = combine1 (ps, pm);
1055 *pd++ = pack_1x64_32 (
1057 unpack_32_1x64 (s), negate_1x64 (
1058 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1065 /* call prefetch hint to optimize cache load*/
1066 cache_prefetch ((__m128i*)ps);
1067 cache_prefetch ((__m128i*)pd);
1068 cache_prefetch ((__m128i*)pm);
1072 __m128i xmm_src_lo, xmm_src_hi;
1073 __m128i xmm_dst_lo, xmm_dst_hi;
1075 /* fill cache line with next memory */
1076 cache_prefetch_next ((__m128i*)ps);
1077 cache_prefetch_next ((__m128i*)pd);
1078 cache_prefetch_next ((__m128i*)pm);
1080 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1081 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1083 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1084 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1086 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1087 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1089 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1090 &xmm_dst_lo, &xmm_dst_hi,
1091 &xmm_dst_lo, &xmm_dst_hi);
1094 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1105 uint32_t s = combine1 (ps, pm);
1108 *pd++ = pack_1x64_32 (
1110 unpack_32_1x64 (s), negate_1x64 (
1111 expand_alpha_1x64 (unpack_32_1x64 (d)))));
1119 static force_inline uint32_t
1120 core_combine_atop_u_pixel_sse2 (uint32_t src,
1123 __m64 s = unpack_32_1x64 (src);
1124 __m64 d = unpack_32_1x64 (dst);
1126 __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1127 __m64 da = expand_alpha_1x64 (d);
1129 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1132 static force_inline void
1133 core_combine_atop_u_sse2 (uint32_t* pd,
1140 __m128i xmm_src_lo, xmm_src_hi;
1141 __m128i xmm_dst_lo, xmm_dst_hi;
1142 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1143 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1145 /* call prefetch hint to optimize cache load*/
1146 cache_prefetch ((__m128i*)ps);
1147 cache_prefetch ((__m128i*)pd);
1148 cache_prefetch ((__m128i*)pm);
1150 while (w && ((unsigned long) pd & 15))
1152 s = combine1 (ps, pm);
1155 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1162 /* call prefetch hint to optimize cache load*/
1163 cache_prefetch ((__m128i*)ps);
1164 cache_prefetch ((__m128i*)pd);
1165 cache_prefetch ((__m128i*)pm);
1169 /* fill cache line with next memory */
1170 cache_prefetch_next ((__m128i*)ps);
1171 cache_prefetch_next ((__m128i*)pd);
1172 cache_prefetch_next ((__m128i*)pm);
1174 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1175 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1177 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1178 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1180 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1181 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1182 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1183 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1185 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1186 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1188 pix_add_multiply_2x128 (
1189 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1190 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1191 &xmm_dst_lo, &xmm_dst_hi);
1194 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1205 s = combine1 (ps, pm);
1208 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1216 static force_inline uint32_t
1217 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1220 __m64 s = unpack_32_1x64 (src);
1221 __m64 d = unpack_32_1x64 (dst);
1223 __m64 sa = expand_alpha_1x64 (s);
1224 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1226 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1229 static force_inline void
1230 core_combine_reverse_atop_u_sse2 (uint32_t* pd,
1237 __m128i xmm_src_lo, xmm_src_hi;
1238 __m128i xmm_dst_lo, xmm_dst_hi;
1239 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1240 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1242 /* call prefetch hint to optimize cache load*/
1243 cache_prefetch ((__m128i*)ps);
1244 cache_prefetch ((__m128i*)pd);
1245 cache_prefetch ((__m128i*)pm);
1247 while (w && ((unsigned long) pd & 15))
1249 s = combine1 (ps, pm);
1252 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1259 /* call prefetch hint to optimize cache load*/
1260 cache_prefetch ((__m128i*)ps);
1261 cache_prefetch ((__m128i*)pd);
1262 cache_prefetch ((__m128i*)pm);
1266 /* fill cache line with next memory */
1267 cache_prefetch_next ((__m128i*)ps);
1268 cache_prefetch_next ((__m128i*)pd);
1269 cache_prefetch_next ((__m128i*)pm);
1271 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1272 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1274 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1275 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1277 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1278 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1279 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1280 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1282 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1283 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1285 pix_add_multiply_2x128 (
1286 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1287 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1288 &xmm_dst_lo, &xmm_dst_hi);
1291 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1302 s = combine1 (ps, pm);
1305 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1313 static force_inline uint32_t
1314 core_combine_xor_u_pixel_sse2 (uint32_t src,
1317 __m64 s = unpack_32_1x64 (src);
1318 __m64 d = unpack_32_1x64 (dst);
1320 __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1321 __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1323 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1326 static force_inline void
1327 core_combine_xor_u_sse2 (uint32_t* dst,
1328 const uint32_t* src,
1329 const uint32_t *mask,
1335 const uint32_t* ps = src;
1336 const uint32_t* pm = mask;
1338 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1339 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1340 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1341 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1343 /* call prefetch hint to optimize cache load*/
1344 cache_prefetch ((__m128i*)ps);
1345 cache_prefetch ((__m128i*)pd);
1346 cache_prefetch ((__m128i*)pm);
1348 while (w && ((unsigned long) pd & 15))
1350 s = combine1 (ps, pm);
1353 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1360 /* call prefetch hint to optimize cache load*/
1361 cache_prefetch ((__m128i*)ps);
1362 cache_prefetch ((__m128i*)pd);
1363 cache_prefetch ((__m128i*)pm);
1367 /* fill cache line with next memory */
1368 cache_prefetch_next ((__m128i*)ps);
1369 cache_prefetch_next ((__m128i*)pd);
1370 cache_prefetch_next ((__m128i*)pm);
1372 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1373 xmm_dst = load_128_aligned ((__m128i*) pd);
1375 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1376 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1378 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1379 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1380 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1381 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1383 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1384 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1385 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1386 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1388 pix_add_multiply_2x128 (
1389 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1390 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1391 &xmm_dst_lo, &xmm_dst_hi);
1394 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1405 s = combine1 (ps, pm);
1408 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1416 static force_inline void
1417 core_combine_add_u_sse2 (uint32_t* dst,
1418 const uint32_t* src,
1419 const uint32_t* mask,
1425 const uint32_t* ps = src;
1426 const uint32_t* pm = mask;
1428 /* call prefetch hint to optimize cache load*/
1429 cache_prefetch ((__m128i*)ps);
1430 cache_prefetch ((__m128i*)pd);
1431 cache_prefetch ((__m128i*)pm);
1433 while (w && (unsigned long)pd & 15)
1435 s = combine1 (ps, pm);
1441 *pd++ = _mm_cvtsi64_si32 (
1442 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1446 /* call prefetch hint to optimize cache load*/
1447 cache_prefetch ((__m128i*)ps);
1448 cache_prefetch ((__m128i*)pd);
1449 cache_prefetch ((__m128i*)pm);
1455 /* fill cache line with next memory */
1456 cache_prefetch_next ((__m128i*)ps);
1457 cache_prefetch_next ((__m128i*)pd);
1458 cache_prefetch_next ((__m128i*)pm);
1460 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1463 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1474 s = combine1 (ps, pm);
1478 *pd++ = _mm_cvtsi64_si32 (
1479 _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
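/* SATURATE for one pixel: when the source alpha exceeds the remaining
 * destination headroom (~dst >> 24), scale the source by
 * DIV_UN8 (da, sa) before the saturating add.
 */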
1485 static force_inline uint32_t
1486 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1489 __m64 ms = unpack_32_1x64 (src);
1490 __m64 md = unpack_32_1x64 (dst);
1491 uint32_t sa = src >> 24;
1492 uint32_t da = ~dst >> 24;
1496 ms = pix_multiply_1x64 (
1497 ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1500 return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1503 static force_inline void
1504 core_combine_saturate_u_sse2 (uint32_t * pd,
1512 __m128i xmm_src, xmm_dst;
1514 /* call prefetch hint to optimize cache load*/
1515 cache_prefetch ((__m128i*)ps);
1516 cache_prefetch ((__m128i*)pd);
1517 cache_prefetch ((__m128i*)pm);
1519 while (w && (unsigned long)pd & 15)
1521 s = combine1 (ps, pm);
1524 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1531 /* call prefetch hint to optimize cache load*/
1532 cache_prefetch ((__m128i*)ps);
1533 cache_prefetch ((__m128i*)pd);
1534 cache_prefetch ((__m128i*)pm);
1538 /* fill cache line with next memory */
1539 cache_prefetch_next ((__m128i*)ps);
1540 cache_prefetch_next ((__m128i*)pd);
1541 cache_prefetch_next ((__m128i*)pm);
1543 xmm_dst = load_128_aligned ((__m128i*)pd);
1544 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1546 pack_cmp = _mm_movemask_epi8 (
1548 _mm_srli_epi32 (xmm_src, 24),
1549 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1551 /* if some source alpha is greater than the respective ~dst alpha */
1554 s = combine1 (ps++, pm);
1556 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1560 s = combine1 (ps++, pm);
1562 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1566 s = combine1 (ps++, pm);
1568 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1572 s = combine1 (ps++, pm);
1574 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1580 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1593 s = combine1 (ps, pm);
1596 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1603 static force_inline void
1604 core_combine_src_ca_sse2 (uint32_t* pd,
1611 __m128i xmm_src_lo, xmm_src_hi;
1612 __m128i xmm_mask_lo, xmm_mask_hi;
1613 __m128i xmm_dst_lo, xmm_dst_hi;
1615 /* call prefetch hint to optimize cache load*/
1616 cache_prefetch ((__m128i*)ps);
1617 cache_prefetch ((__m128i*)pd);
1618 cache_prefetch ((__m128i*)pm);
1620 while (w && (unsigned long)pd & 15)
1624 *pd++ = pack_1x64_32 (
1625 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1629 /* call prefetch hint to optimize cache load*/
1630 cache_prefetch ((__m128i*)ps);
1631 cache_prefetch ((__m128i*)pd);
1632 cache_prefetch ((__m128i*)pm);
1636 /* fill cache line with next memory */
1637 cache_prefetch_next ((__m128i*)ps);
1638 cache_prefetch_next ((__m128i*)pd);
1639 cache_prefetch_next ((__m128i*)pm);
1641 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1642 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1644 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1645 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1647 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1648 &xmm_mask_lo, &xmm_mask_hi,
1649 &xmm_dst_lo, &xmm_dst_hi);
1652 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1664 *pd++ = pack_1x64_32 (
1665 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1670 static force_inline uint32_t
1671 core_combine_over_ca_pixel_sse2 (uint32_t src,
1675 __m64 s = unpack_32_1x64 (src);
1676 __m64 expAlpha = expand_alpha_1x64 (s);
1677 __m64 unpk_mask = unpack_32_1x64 (mask);
1678 __m64 unpk_dst = unpack_32_1x64 (dst);
1680 return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1683 static force_inline void
1684 core_combine_over_ca_sse2 (uint32_t* pd,
1691 __m128i xmm_alpha_lo, xmm_alpha_hi;
1692 __m128i xmm_src_lo, xmm_src_hi;
1693 __m128i xmm_dst_lo, xmm_dst_hi;
1694 __m128i xmm_mask_lo, xmm_mask_hi;
1696 /* call prefetch hint to optimize cache load*/
1697 cache_prefetch ((__m128i*)ps);
1698 cache_prefetch ((__m128i*)pd);
1699 cache_prefetch ((__m128i*)pm);
1701 while (w && (unsigned long)pd & 15)
1707 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1711 /* call prefetch hint to optimize cache load*/
1712 cache_prefetch ((__m128i*)ps);
1713 cache_prefetch ((__m128i*)pd);
1714 cache_prefetch ((__m128i*)pm);
1718 /* fill cache line with next memory */
1719 cache_prefetch_next ((__m128i*)ps);
1720 cache_prefetch_next ((__m128i*)pd);
1721 cache_prefetch_next ((__m128i*)pm);
1723 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1724 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1725 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1727 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1728 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1729 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1731 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1732 &xmm_alpha_lo, &xmm_alpha_hi);
1734 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1735 &xmm_alpha_lo, &xmm_alpha_hi,
1736 &xmm_mask_lo, &xmm_mask_hi,
1737 &xmm_dst_lo, &xmm_dst_hi);
1740 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1754 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1759 static force_inline uint32_t
1760 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1764 __m64 d = unpack_32_1x64 (dst);
1766 return pack_1x64_32 (
1767 over_1x64 (d, expand_alpha_1x64 (d),
1768 pix_multiply_1x64 (unpack_32_1x64 (src),
1769 unpack_32_1x64 (mask))));
1772 static force_inline void
1773 core_combine_over_reverse_ca_sse2 (uint32_t* pd,
1780 __m128i xmm_alpha_lo, xmm_alpha_hi;
1781 __m128i xmm_src_lo, xmm_src_hi;
1782 __m128i xmm_dst_lo, xmm_dst_hi;
1783 __m128i xmm_mask_lo, xmm_mask_hi;
1785 /* call prefetch hint to optimize cache load*/
1786 cache_prefetch ((__m128i*)ps);
1787 cache_prefetch ((__m128i*)pd);
1788 cache_prefetch ((__m128i*)pm);
1790 while (w && (unsigned long)pd & 15)
1796 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1800 /* call prefetch hint to optimize cache load*/
1801 cache_prefetch ((__m128i*)ps);
1802 cache_prefetch ((__m128i*)pd);
1803 cache_prefetch ((__m128i*)pm);
1807 /* fill cache line with next memory */
1808 cache_prefetch_next ((__m128i*)ps);
1809 cache_prefetch_next ((__m128i*)pd);
1810 cache_prefetch_next ((__m128i*)pm);
1812 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1813 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1814 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1816 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1817 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1818 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1820 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1821 &xmm_alpha_lo, &xmm_alpha_hi);
1822 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1823 &xmm_mask_lo, &xmm_mask_hi,
1824 &xmm_mask_lo, &xmm_mask_hi);
1826 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1827 &xmm_alpha_lo, &xmm_alpha_hi,
1828 &xmm_mask_lo, &xmm_mask_hi);
1831 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1845 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1850 static force_inline void
1851 core_combine_in_ca_sse2 (uint32_t * pd,
1858 __m128i xmm_alpha_lo, xmm_alpha_hi;
1859 __m128i xmm_src_lo, xmm_src_hi;
1860 __m128i xmm_dst_lo, xmm_dst_hi;
1861 __m128i xmm_mask_lo, xmm_mask_hi;
1863 /* call prefetch hint to optimize cache load*/
1864 cache_prefetch ((__m128i*)ps);
1865 cache_prefetch ((__m128i*)pd);
1866 cache_prefetch ((__m128i*)pm);
1868 while (w && (unsigned long)pd & 15)
1874 *pd++ = pack_1x64_32 (
1876 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1877 expand_alpha_1x64 (unpack_32_1x64 (d))));
1882 /* call prefetch hint to optimize cache load*/
1883 cache_prefetch ((__m128i*)ps);
1884 cache_prefetch ((__m128i*)pd);
1885 cache_prefetch ((__m128i*)pm);
1889 /* fill cache line with next memory */
1890 cache_prefetch_next ((__m128i*)ps);
1891 cache_prefetch_next ((__m128i*)pd);
1892 cache_prefetch_next ((__m128i*)pm);
1894 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1895 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1896 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1898 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1899 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1900 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1902 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1903 &xmm_alpha_lo, &xmm_alpha_hi);
1905 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1906 &xmm_mask_lo, &xmm_mask_hi,
1907 &xmm_dst_lo, &xmm_dst_hi);
1909 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1910 &xmm_alpha_lo, &xmm_alpha_hi,
1911 &xmm_dst_lo, &xmm_dst_hi);
1914 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1928 *pd++ = pack_1x64_32 (
1931 unpack_32_1x64 (s), unpack_32_1x64 (m)),
1932 expand_alpha_1x64 (unpack_32_1x64 (d))));
1938 static force_inline void
1939 core_combine_in_reverse_ca_sse2 (uint32_t * pd,
1946 __m128i xmm_alpha_lo, xmm_alpha_hi;
1947 __m128i xmm_src_lo, xmm_src_hi;
1948 __m128i xmm_dst_lo, xmm_dst_hi;
1949 __m128i xmm_mask_lo, xmm_mask_hi;
1951 /* call prefetch hint to optimize cache load*/
1952 cache_prefetch ((__m128i*)ps);
1953 cache_prefetch ((__m128i*)pd);
1954 cache_prefetch ((__m128i*)pm);
1956 while (w && (unsigned long)pd & 15)
1962 *pd++ = pack_1x64_32 (
1965 pix_multiply_1x64 (unpack_32_1x64 (m),
1966 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1970 /* call prefetch hint to optimize cache load*/
1971 cache_prefetch ((__m128i*)ps);
1972 cache_prefetch ((__m128i*)pd);
1973 cache_prefetch ((__m128i*)pm);
1977 /* fill cache line with next memory */
1978 cache_prefetch_next ((__m128i*)ps);
1979 cache_prefetch_next ((__m128i*)pd);
1980 cache_prefetch_next ((__m128i*)pm);
1982 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1983 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1984 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1986 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1987 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1988 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1990 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1991 &xmm_alpha_lo, &xmm_alpha_hi);
1992 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1993 &xmm_alpha_lo, &xmm_alpha_hi,
1994 &xmm_alpha_lo, &xmm_alpha_hi);
1996 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1997 &xmm_alpha_lo, &xmm_alpha_hi,
1998 &xmm_dst_lo, &xmm_dst_hi);
2001 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2015 *pd++ = pack_1x64_32 (
2018 pix_multiply_1x64 (unpack_32_1x64 (m),
2019 expand_alpha_1x64 (unpack_32_1x64 (s)))));
2024 static force_inline void
2025 core_combine_out_ca_sse2 (uint32_t * pd,
2032 __m128i xmm_alpha_lo, xmm_alpha_hi;
2033 __m128i xmm_src_lo, xmm_src_hi;
2034 __m128i xmm_dst_lo, xmm_dst_hi;
2035 __m128i xmm_mask_lo, xmm_mask_hi;
2037 /* call prefetch hint to optimize cache load*/
2038 cache_prefetch ((__m128i*)ps);
2039 cache_prefetch ((__m128i*)pd);
2040 cache_prefetch ((__m128i*)pm);
2042 while (w && (unsigned long)pd & 15)
2048 *pd++ = pack_1x64_32 (
2051 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2052 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2056 /* call prefetch hint to optimize cache load*/
2057 cache_prefetch ((__m128i*)ps);
2058 cache_prefetch ((__m128i*)pd);
2059 cache_prefetch ((__m128i*)pm);
2063 /* fill cache line with next memory */
2064 cache_prefetch_next ((__m128i*)ps);
2065 cache_prefetch_next ((__m128i*)pd);
2066 cache_prefetch_next ((__m128i*)pm);
2068 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2069 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2070 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2072 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2073 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2074 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2076 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2077 &xmm_alpha_lo, &xmm_alpha_hi);
2078 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2079 &xmm_alpha_lo, &xmm_alpha_hi);
2081 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2082 &xmm_mask_lo, &xmm_mask_hi,
2083 &xmm_dst_lo, &xmm_dst_hi);
2084 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2085 &xmm_alpha_lo, &xmm_alpha_hi,
2086 &xmm_dst_lo, &xmm_dst_hi);
2089 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2103 *pd++ = pack_1x64_32 (
2106 unpack_32_1x64 (s), unpack_32_1x64 (m)),
2107 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2113 static force_inline void
2114 core_combine_out_reverse_ca_sse2 (uint32_t * pd,
2121 __m128i xmm_alpha_lo, xmm_alpha_hi;
2122 __m128i xmm_src_lo, xmm_src_hi;
2123 __m128i xmm_dst_lo, xmm_dst_hi;
2124 __m128i xmm_mask_lo, xmm_mask_hi;
2126 /* call prefetch hint to optimize cache load*/
2127 cache_prefetch ((__m128i*)ps);
2128 cache_prefetch ((__m128i*)pd);
2129 cache_prefetch ((__m128i*)pm);
2131 while (w && (unsigned long)pd & 15)
2137 *pd++ = pack_1x64_32 (
2140 negate_1x64 (pix_multiply_1x64 (
2142 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2146 /* call prefetch hint to optimize cache load*/
2147 cache_prefetch ((__m128i*)ps);
2148 cache_prefetch ((__m128i*)pd);
2149 cache_prefetch ((__m128i*)pm);
2153 /* fill cache line with next memory */
2154 cache_prefetch_next ((__m128i*)ps);
2155 cache_prefetch_next ((__m128i*)pd);
2156 cache_prefetch_next ((__m128i*)pm);
2158 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2159 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2160 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2162 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2163 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2164 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2166 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2167 &xmm_alpha_lo, &xmm_alpha_hi);
2169 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2170 &xmm_alpha_lo, &xmm_alpha_hi,
2171 &xmm_mask_lo, &xmm_mask_hi);
2173 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2174 &xmm_mask_lo, &xmm_mask_hi);
2176 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2177 &xmm_mask_lo, &xmm_mask_hi,
2178 &xmm_dst_lo, &xmm_dst_hi);
2181 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2195 *pd++ = pack_1x64_32 (
2198 negate_1x64 (pix_multiply_1x64 (
2200 expand_alpha_1x64 (unpack_32_1x64 (s))))));
2205 static force_inline uint32_t
2206 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2210 __m64 m = unpack_32_1x64 (mask);
2211 __m64 s = unpack_32_1x64 (src);
2212 __m64 d = unpack_32_1x64 (dst);
2213 __m64 sa = expand_alpha_1x64 (s);
2214 __m64 da = expand_alpha_1x64 (d);
2216 s = pix_multiply_1x64 (s, m);
2217 m = negate_1x64 (pix_multiply_1x64 (m, sa));
2219 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2222 static force_inline void
2223 core_combine_atop_ca_sse2 (uint32_t * pd,
2230 __m128i xmm_src_lo, xmm_src_hi;
2231 __m128i xmm_dst_lo, xmm_dst_hi;
2232 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2233 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2234 __m128i xmm_mask_lo, xmm_mask_hi;
2236 /* call prefetch hint to optimize cache load*/
2237 cache_prefetch ((__m128i*)ps);
2238 cache_prefetch ((__m128i*)pd);
2239 cache_prefetch ((__m128i*)pm);
2241 while (w && (unsigned long)pd & 15)
2247 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2251 /* call prefetch hint to optimize cache load*/
2252 cache_prefetch ((__m128i*)ps);
2253 cache_prefetch ((__m128i*)pd);
2254 cache_prefetch ((__m128i*)pm);
2258 /* fill cache line with next memory */
2259 cache_prefetch_next ((__m128i*)ps);
2260 cache_prefetch_next ((__m128i*)pd);
2261 cache_prefetch_next ((__m128i*)pm);
2263 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2264 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2265 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2267 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2268 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2269 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2271 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2272 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2273 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2274 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2276 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2277 &xmm_mask_lo, &xmm_mask_hi,
2278 &xmm_src_lo, &xmm_src_hi);
2279 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2280 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2281 &xmm_mask_lo, &xmm_mask_hi);
2283 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2285 pix_add_multiply_2x128 (
2286 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2287 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2288 &xmm_dst_lo, &xmm_dst_hi);
2291 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2305 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2310 static force_inline uint32_t
2311 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2315 __m64 m = unpack_32_1x64 (mask);
2316 __m64 s = unpack_32_1x64 (src);
2317 __m64 d = unpack_32_1x64 (dst);
2319 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2320 __m64 sa = expand_alpha_1x64 (s);
2322 s = pix_multiply_1x64 (s, m);
2323 m = pix_multiply_1x64 (m, sa);
2325 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2328 static force_inline void
2329 core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
2336 __m128i xmm_src_lo, xmm_src_hi;
2337 __m128i xmm_dst_lo, xmm_dst_hi;
2338 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2339 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2340 __m128i xmm_mask_lo, xmm_mask_hi;
2342 /* call prefetch hint to optimize cache load*/
2343 cache_prefetch ((__m128i*)ps);
2344 cache_prefetch ((__m128i*)pd);
2345 cache_prefetch ((__m128i*)pm);
2347 while (w && (unsigned long)pd & 15)
2353 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2357 /* call prefetch hint to optimize cache load*/
2358 cache_prefetch ((__m128i*)ps);
2359 cache_prefetch ((__m128i*)pd);
2360 cache_prefetch ((__m128i*)pm);
2364 /* fill cache line with next memory */
2365 cache_prefetch_next ((__m128i*)ps);
2366 cache_prefetch_next ((__m128i*)pd);
2367 cache_prefetch_next ((__m128i*)pm);
2369 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2370 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2371 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2373 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2374 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2375 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2377 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2378 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2379 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2380 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2382 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2383 &xmm_mask_lo, &xmm_mask_hi,
2384 &xmm_src_lo, &xmm_src_hi);
2385 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2386 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2387 &xmm_mask_lo, &xmm_mask_hi);
2389 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2390 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2392 pix_add_multiply_2x128 (
2393 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2394 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2395 &xmm_dst_lo, &xmm_dst_hi);
2398 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2412 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2417 static force_inline uint32_t
2418 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2422 __m64 a = unpack_32_1x64 (mask);
2423 __m64 s = unpack_32_1x64 (src);
2424 __m64 d = unpack_32_1x64 (dst);
2426 __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2427 a, expand_alpha_1x64 (s)));
2428 __m64 dest = pix_multiply_1x64 (s, a);
2429 __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2431 return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2437 static force_inline void
2438 core_combine_xor_ca_sse2 (uint32_t * pd,
2445 __m128i xmm_src_lo, xmm_src_hi;
2446 __m128i xmm_dst_lo, xmm_dst_hi;
2447 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2448 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2449 __m128i xmm_mask_lo, xmm_mask_hi;
2451 /* call prefetch hint to optimize cache load*/
2452 cache_prefetch ((__m128i*)ps);
2453 cache_prefetch ((__m128i*)pd);
2454 cache_prefetch ((__m128i*)pm);
2456 while (w && (unsigned long)pd & 15)
2462 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2466 /* call prefetch hint to optimize cache load*/
2467 cache_prefetch ((__m128i*)ps);
2468 cache_prefetch ((__m128i*)pd);
2469 cache_prefetch ((__m128i*)pm);
2473 /* fill cache line with next memory */
2474 cache_prefetch_next ((__m128i*)ps);
2475 cache_prefetch_next ((__m128i*)pd);
2476 cache_prefetch_next ((__m128i*)pm);
2478 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2479 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2480 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2482 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2483 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2484 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2486 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2487 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2488 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2489 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2491 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2492 &xmm_mask_lo, &xmm_mask_hi,
2493 &xmm_src_lo, &xmm_src_hi);
2494 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2495 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2496 &xmm_mask_lo, &xmm_mask_hi);
2498 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2499 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2500 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2501 &xmm_mask_lo, &xmm_mask_hi);
2503 pix_add_multiply_2x128 (
2504 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2505 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2506 &xmm_dst_lo, &xmm_dst_hi);
2509 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2523 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2528 static force_inline void
2529 core_combine_add_ca_sse2 (uint32_t * pd,
2536 __m128i xmm_src_lo, xmm_src_hi;
2537 __m128i xmm_dst_lo, xmm_dst_hi;
2538 __m128i xmm_mask_lo, xmm_mask_hi;
2540 /* call prefetch hint to optimize cache load*/
2541 cache_prefetch ((__m128i*)ps);
2542 cache_prefetch ((__m128i*)pd);
2543 cache_prefetch ((__m128i*)pm);
2545 while (w && (unsigned long)pd & 15)
2551 *pd++ = pack_1x64_32 (
2552 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2553 unpack_32_1x64 (m)),
2554 unpack_32_1x64 (d)));
2558 /* call prefetch hint to optimize cache load*/
2559 cache_prefetch ((__m128i*)ps);
2560 cache_prefetch ((__m128i*)pd);
2561 cache_prefetch ((__m128i*)pm);
2565 /* fill cache line with next memory */
2566 cache_prefetch_next ((__m128i*)ps);
2567 cache_prefetch_next ((__m128i*)pd);
2568 cache_prefetch_next ((__m128i*)pm);
2570 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2571 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2572 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2574 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2575 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2576 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2578 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2579 &xmm_mask_lo, &xmm_mask_hi,
2580 &xmm_src_lo, &xmm_src_hi);
2583 (__m128i*)pd, pack_2x128_128 (
2584 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2585 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2599 *pd++ = pack_1x64_32 (
2600 _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2601 unpack_32_1x64 (m)),
2602 unpack_32_1x64 (d)));
2607 /* ---------------------------------------------------
2608 * fb_compose_setup_SSE2
2610 static force_inline __m64
2611 create_mask_16_64 (uint16_t mask)
2613 return _mm_set1_pi16 (mask);
2616 static force_inline __m128i
2617 create_mask_16_128 (uint16_t mask)
2619 return _mm_set1_epi16 (mask);
2622 static force_inline __m64
2623 create_mask_2x32_64 (uint32_t mask0,
2626 return _mm_set_pi32 (mask0, mask1);
2629 static force_inline __m128i
2630 create_mask_2x32_128 (uint32_t mask0,
2633 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2636 /* SSE2 code paths for fbcompose.c */
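/* Thin wrappers with the pixman combine-function signature; each one
 * simply forwards to the corresponding core_combine_*_sse2 helper
 * above.
 */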
2639 sse2_combine_over_u (pixman_implementation_t *imp,
2642 const uint32_t * src,
2643 const uint32_t * mask,
2646 core_combine_over_u_sse2 (dst, src, mask, width);
2651 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2654 const uint32_t * src,
2655 const uint32_t * mask,
2658 core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2663 sse2_combine_in_u (pixman_implementation_t *imp,
2666 const uint32_t * src,
2667 const uint32_t * mask,
2670 core_combine_in_u_sse2 (dst, src, mask, width);
2675 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2678 const uint32_t * src,
2679 const uint32_t * mask,
2682 core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2687 sse2_combine_out_u (pixman_implementation_t *imp,
2690 const uint32_t * src,
2691 const uint32_t * mask,
2694 core_combine_out_u_sse2 (dst, src, mask, width);
2699 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2702 const uint32_t * src,
2703 const uint32_t * mask,
2706 core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2711 sse2_combine_atop_u (pixman_implementation_t *imp,
2714 const uint32_t * src,
2715 const uint32_t * mask,
2718 core_combine_atop_u_sse2 (dst, src, mask, width);
2723 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2726 const uint32_t * src,
2727 const uint32_t * mask,
2730 core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2735 sse2_combine_xor_u (pixman_implementation_t *imp,
2738 const uint32_t * src,
2739 const uint32_t * mask,
2742 core_combine_xor_u_sse2 (dst, src, mask, width);
2747 sse2_combine_add_u (pixman_implementation_t *imp,
2750 const uint32_t * src,
2751 const uint32_t * mask,
2754 core_combine_add_u_sse2 (dst, src, mask, width);
2759 sse2_combine_saturate_u (pixman_implementation_t *imp,
2762 const uint32_t * src,
2763 const uint32_t * mask,
2766 core_combine_saturate_u_sse2 (dst, src, mask, width);
2771 sse2_combine_src_ca (pixman_implementation_t *imp,
2774 const uint32_t * src,
2775 const uint32_t * mask,
2778 core_combine_src_ca_sse2 (dst, src, mask, width);
2783 sse2_combine_over_ca (pixman_implementation_t *imp,
2786 const uint32_t * src,
2787 const uint32_t * mask,
2790 core_combine_over_ca_sse2 (dst, src, mask, width);
2795 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2798 const uint32_t * src,
2799 const uint32_t * mask,
2802 core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2807 sse2_combine_in_ca (pixman_implementation_t *imp,
2810 const uint32_t * src,
2811 const uint32_t * mask,
2814 core_combine_in_ca_sse2 (dst, src, mask, width);
2819 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2822 const uint32_t * src,
2823 const uint32_t * mask,
2826 core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2831 sse2_combine_out_ca (pixman_implementation_t *imp,
2834 const uint32_t * src,
2835 const uint32_t * mask,
2838 core_combine_out_ca_sse2 (dst, src, mask, width);
2843 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2846 const uint32_t * src,
2847 const uint32_t * mask,
2850 core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2855 sse2_combine_atop_ca (pixman_implementation_t *imp,
2858 const uint32_t * src,
2859 const uint32_t * mask,
2862 core_combine_atop_ca_sse2 (dst, src, mask, width);
2867 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2870 const uint32_t * src,
2871 const uint32_t * mask,
2874 core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2879 sse2_combine_xor_ca (pixman_implementation_t *imp,
2882 const uint32_t * src,
2883 const uint32_t * mask,
2886 core_combine_xor_ca_sse2 (dst, src, mask, width);
2891 sse2_combine_add_ca (pixman_implementation_t *imp,
2894 const uint32_t * src,
2895 const uint32_t * mask,
2898 core_combine_add_ca_sse2 (dst, src, mask, width);
2902 /* -------------------------------------------------------------------
2903 * composite_over_n_8888
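 * Solid (n) source composited OVER an 8888 destination.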
2907 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2909 pixman_image_t * src_image,
2910 pixman_image_t * mask_image,
2911 pixman_image_t * dst_image,
2922 uint32_t *dst_line, *dst, d;
2925 __m128i xmm_src, xmm_alpha;
2926 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2928 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2933 PIXMAN_IMAGE_GET_LINE (
2934 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2936 xmm_src = expand_pixel_32_1x128 (src);
2937 xmm_alpha = expand_alpha_1x128 (xmm_src);
2943 /* call prefetch hint to optimize cache load*/
2944 cache_prefetch ((__m128i*)dst);
2946 dst_line += dst_stride;
2949 while (w && (unsigned long)dst & 15)
2952 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2953 _mm_movepi64_pi64 (xmm_alpha),
2954 unpack_32_1x64 (d)));
2958 cache_prefetch ((__m128i*)dst);
2962 /* fill cache line with next memory */
2963 cache_prefetch_next ((__m128i*)dst);
2965 xmm_dst = load_128_aligned ((__m128i*)dst);
2967 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2969 over_2x128 (&xmm_src, &xmm_src,
2970 &xmm_alpha, &xmm_alpha,
2971 &xmm_dst_lo, &xmm_dst_hi);
2973 /* rebuild the 4 pixels and save */
2975 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2984 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2985 _mm_movepi64_pi64 (xmm_alpha),
2986 unpack_32_1x64 (d)));
2994 /* ---------------------------------------------------------------------
2995 * composite_over_n_0565
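 * Solid (n) source composited OVER an r5g6b5 destination.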
2998 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3000 pixman_image_t * src_image,
3001 pixman_image_t * mask_image,
3002 pixman_image_t * dst_image,
3013 uint16_t *dst_line, *dst, d;
3016 __m128i xmm_src, xmm_alpha;
3017 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3019 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3024 PIXMAN_IMAGE_GET_LINE (
3025 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3027 xmm_src = expand_pixel_32_1x128 (src);
3028 xmm_alpha = expand_alpha_1x128 (xmm_src);
3034 /* call prefetch hint to optimize cache load*/
3035 cache_prefetch ((__m128i*)dst);
3037 dst_line += dst_stride;
3040 while (w && (unsigned long)dst & 15)
3044 *dst++ = pack_565_32_16 (
3045 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3046 _mm_movepi64_pi64 (xmm_alpha),
3047 expand565_16_1x64 (d))));
3051 /* call prefetch hint to optimize cache load*/
3052 cache_prefetch ((__m128i*)dst);
3056 /* fill cache line with next memory */
3057 cache_prefetch_next ((__m128i*)dst);
3059 xmm_dst = load_128_aligned ((__m128i*)dst);
3061 unpack_565_128_4x128 (xmm_dst,
3062 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3064 over_2x128 (&xmm_src, &xmm_src,
3065 &xmm_alpha, &xmm_alpha,
3066 &xmm_dst0, &xmm_dst1);
3067 over_2x128 (&xmm_src, &xmm_src,
3068 &xmm_alpha, &xmm_alpha,
3069 &xmm_dst2, &xmm_dst3);
3071 xmm_dst = pack_565_4x128_128 (
3072 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3074 save_128_aligned ((__m128i*)dst, xmm_dst);
3083 *dst++ = pack_565_32_16 (
3084 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3085 _mm_movepi64_pi64 (xmm_alpha),
3086 expand565_16_1x64 (d))));
3093 /* ---------------------------------------------------------------------------
3094 * composite_over_n_8888_8888_ca
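 * Solid (n) source composited OVER an 8888 destination through an
 * a8r8g8b8 component-alpha mask.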
3098 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3100 pixman_image_t * src_image,
3101 pixman_image_t * mask_image,
3102 pixman_image_t * dst_image,
3113 uint32_t *dst_line, d;
3114 uint32_t *mask_line, m;
3116 int dst_stride, mask_stride;
3118 __m128i xmm_src, xmm_alpha;
3119 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3120 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3122 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3124 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3129 PIXMAN_IMAGE_GET_LINE (
3130 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3131 PIXMAN_IMAGE_GET_LINE (
3132 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3134 xmm_src = _mm_unpacklo_epi8 (
3135 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3136 xmm_alpha = expand_alpha_1x128 (xmm_src);
3137 mmx_src = _mm_movepi64_pi64 (xmm_src);
3138 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3143 const uint32_t *pm = (uint32_t *)mask_line;
3144 uint32_t *pd = (uint32_t *)dst_line;
3146 dst_line += dst_stride;
3147 mask_line += mask_stride;
3149 /* call prefetch hint to optimize cache load*/
3150 cache_prefetch ((__m128i*)pd);
3151 cache_prefetch ((__m128i*)pm);
3153 while (w && (unsigned long)pd & 15)
3160 mmx_mask = unpack_32_1x64 (m);
3161 mmx_dest = unpack_32_1x64 (d);
3163 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3173 /* call prefetch hint to optimize cache load*/
3174 cache_prefetch ((__m128i*)pd);
3175 cache_prefetch ((__m128i*)pm);
3179 /* fill cache line with next memory */
3180 cache_prefetch_next ((__m128i*)pd);
3181 cache_prefetch_next ((__m128i*)pm);
3183 xmm_mask = load_128_unaligned ((__m128i*)pm);
3187 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3189 /* if all bits in the mask are zero, pack_cmp equals 0xffff */
3190 if (pack_cmp != 0xffff)
3192 xmm_dst = load_128_aligned ((__m128i*)pd);
3194 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3195 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3197 in_over_2x128 (&xmm_src, &xmm_src,
3198 &xmm_alpha, &xmm_alpha,
3199 &xmm_mask_lo, &xmm_mask_hi,
3200 &xmm_dst_lo, &xmm_dst_hi);
3203 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3218 mmx_mask = unpack_32_1x64 (m);
3219 mmx_dest = unpack_32_1x64 (d);
3221 *pd = pack_1x64_32 (
3222 in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3233 /*---------------------------------------------------------------------
3234 * composite_over_8888_n_8888
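 * 8888 source composited OVER an 8888 destination, attenuated by the
 * alpha of a solid (n) mask.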
3238 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3240 pixman_image_t * src_image,
3241 pixman_image_t * mask_image,
3242 pixman_image_t * dst_image,
3252 uint32_t *dst_line, *dst;
3253 uint32_t *src_line, *src;
3256 int dst_stride, src_stride;
3259 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3260 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3261 __m128i xmm_alpha_lo, xmm_alpha_hi;
3263 PIXMAN_IMAGE_GET_LINE (
3264 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3265 PIXMAN_IMAGE_GET_LINE (
3266 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3268 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3270 xmm_mask = create_mask_16_128 (mask >> 24);
3275 dst_line += dst_stride;
3277 src_line += src_stride;
3280 /* call prefetch hint to optimize cache load*/
3281 cache_prefetch ((__m128i*)dst);
3282 cache_prefetch ((__m128i*)src);
3284 while (w && (unsigned long)dst & 15)
3286 uint32_t s = *src++;
3289 __m64 ms = unpack_32_1x64 (s);
3290 __m64 alpha = expand_alpha_1x64 (ms);
3291 __m64 dest = _mm_movepi64_pi64 (xmm_mask);
3292 __m64 alpha_dst = unpack_32_1x64 (d);
3294 *dst++ = pack_1x64_32 (
3295 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
3300 /* call prefetch hint to optimize cache load*/
3301 cache_prefetch ((__m128i*)dst);
3302 cache_prefetch ((__m128i*)src);
3306 /* fill cache line with next memory */
3307 cache_prefetch_next ((__m128i*)dst);
3308 cache_prefetch_next ((__m128i*)src);
3310 xmm_src = load_128_unaligned ((__m128i*)src);
3311 xmm_dst = load_128_aligned ((__m128i*)dst);
3313 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3314 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3315 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3316 &xmm_alpha_lo, &xmm_alpha_hi);
3318 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3319 &xmm_alpha_lo, &xmm_alpha_hi,
3320 &xmm_mask, &xmm_mask,
3321 &xmm_dst_lo, &xmm_dst_hi);
3324 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3333 uint32_t s = *src++;
3336 __m64 ms = unpack_32_1x64 (s);
3337 __m64 alpha = expand_alpha_1x64 (ms);
3338 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3339 __m64 dest = unpack_32_1x64 (d);
3341 *dst++ = pack_1x64_32 (
3342 in_over_1x64 (&ms, &alpha, &mask, &dest));
3351 /* ---------------------------------------------------------------------
3352 * composite_over_x888_n_8888
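 * x888 source (alpha forced to 0xff) composited OVER an 8888
 * destination, attenuated by the alpha of a solid (n) mask.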
3355 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3357 pixman_image_t * src_image,
3358 pixman_image_t * mask_image,
3359 pixman_image_t * dst_image,
3369 uint32_t *dst_line, *dst;
3370 uint32_t *src_line, *src;
3372 int dst_stride, src_stride;
3375 __m128i xmm_mask, xmm_alpha;
3376 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3377 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3379 PIXMAN_IMAGE_GET_LINE (
3380 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3381 PIXMAN_IMAGE_GET_LINE (
3382 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3384 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3386 xmm_mask = create_mask_16_128 (mask >> 24);
3387 xmm_alpha = mask_00ff;
3392 dst_line += dst_stride;
3394 src_line += src_stride;
3397 /* call prefetch hint to optimize cache load*/
3398 cache_prefetch ((__m128i*)dst);
3399 cache_prefetch ((__m128i*)src);
3401 while (w && (unsigned long)dst & 15)
3403 uint32_t s = (*src++) | 0xff000000;
3406 __m64 src = unpack_32_1x64 (s);
3407 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3408 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3409 __m64 dest = unpack_32_1x64 (d);
3411 *dst++ = pack_1x64_32 (
3412 in_over_1x64 (&src, &alpha, &mask, &dest));
3417 /* call prefetch hint to optimize cache load*/
3418 cache_prefetch ((__m128i*)dst);
3419 cache_prefetch ((__m128i*)src);
3423 /* fill cache line with next memory */
3424 cache_prefetch_next ((__m128i*)dst);
3425 cache_prefetch_next ((__m128i*)src);
3427 xmm_src = _mm_or_si128 (
3428 load_128_unaligned ((__m128i*)src), mask_ff000000);
3429 xmm_dst = load_128_aligned ((__m128i*)dst);
3431 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3432 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3434 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3435 &xmm_alpha, &xmm_alpha,
3436 &xmm_mask, &xmm_mask,
3437 &xmm_dst_lo, &xmm_dst_hi);
3440 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3450 uint32_t s = (*src++) | 0xff000000;
3453 __m64 src = unpack_32_1x64 (s);
3454 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3455 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3456 __m64 dest = unpack_32_1x64 (d);
3458 *dst++ = pack_1x64_32 (
3459 in_over_1x64 (&src, &alpha, &mask, &dest));
3468 /* --------------------------------------------------------------------
3469 * composite_over_8888_8888
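 * 8888 source composited OVER an 8888 destination; each scanline is
 * handled by core_combine_over_u_sse2.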
3472 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3474 pixman_image_t * src_image,
3475 pixman_image_t * mask_image,
3476 pixman_image_t * dst_image,
3486 int dst_stride, src_stride;
3487 uint32_t *dst_line, *dst;
3488 uint32_t *src_line, *src;
3490 PIXMAN_IMAGE_GET_LINE (
3491 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3492 PIXMAN_IMAGE_GET_LINE (
3493 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3500 core_combine_over_u_sse2 (dst, src, NULL, width);
3508 /* ------------------------------------------------------------------
3509 * composite_over_8888_0565
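 * 8888 source composited OVER an r5g6b5 destination.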
3511 static force_inline uint16_t
3512 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3516 ms = unpack_32_1x64 (src);
3517 return pack_565_32_16 (
3520 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3524 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3526 pixman_image_t * src_image,
3527 pixman_image_t * mask_image,
3528 pixman_image_t * dst_image,
3538 uint16_t *dst_line, *dst, d;
3539 uint32_t *src_line, *src, s;
3540 int dst_stride, src_stride;
3543 __m128i xmm_alpha_lo, xmm_alpha_hi;
3544 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3545 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3547 PIXMAN_IMAGE_GET_LINE (
3548 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3549 PIXMAN_IMAGE_GET_LINE (
3550 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3555 * This code was copied from the MMX version, FIXME and all.
3556 * If it's a problem there, it's probably a problem here too.
3558 assert (src_image->drawable == mask_image->drawable);
3566 /* call prefetch hint to optimize cache load*/
3567 cache_prefetch ((__m128i*)src);
3568 cache_prefetch ((__m128i*)dst);
3570 dst_line += dst_stride;
3571 src_line += src_stride;
3574 /* Align dst on a 16-byte boundary */
3576 ((unsigned long)dst & 15))
3581 *dst++ = composite_over_8888_0565pixel (s, d);
3585 /* call prefetch hint to optimize cache load*/
3586 cache_prefetch ((__m128i*)src);
3587 cache_prefetch ((__m128i*)dst);
3589 /* This is an 8-pixel loop */
3592 /* fill cache line with next memory */
3593 cache_prefetch_next ((__m128i*)src);
3594 cache_prefetch_next ((__m128i*)dst);
3596 /* Load unaligned because the source address is not
3597 * guaranteed to be 16-byte aligned.
3599 xmm_src = load_128_unaligned ((__m128i*) src);
3600 xmm_dst = load_128_aligned ((__m128i*) dst);
3603 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3604 unpack_565_128_4x128 (xmm_dst,
3605 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3606 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3607 &xmm_alpha_lo, &xmm_alpha_hi);
3609 /* Load the next 4 pixels from memory early
3610 * to optimize the memory read.
3612 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3614 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3615 &xmm_alpha_lo, &xmm_alpha_hi,
3616 &xmm_dst0, &xmm_dst1);
3619 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3620 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3621 &xmm_alpha_lo, &xmm_alpha_hi);
3623 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3624 &xmm_alpha_lo, &xmm_alpha_hi,
3625 &xmm_dst2, &xmm_dst3);
3628 (__m128i*)dst, pack_565_4x128_128 (
3629 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3641 *dst++ = composite_over_8888_0565pixel (s, d);
3648 /* -----------------------------------------------------------------
3649 * composite_over_n_8_8888
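 * Solid (n) source composited OVER an 8888 destination through an a8 mask.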
3653 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3655 pixman_image_t * src_image,
3656 pixman_image_t * mask_image,
3657 pixman_image_t * dst_image,
3668 uint32_t *dst_line, *dst;
3669 uint8_t *mask_line, *mask;
3670 int dst_stride, mask_stride;
3674 __m128i xmm_src, xmm_alpha, xmm_def;
3675 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3676 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3678 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3680 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3686 PIXMAN_IMAGE_GET_LINE (
3687 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3688 PIXMAN_IMAGE_GET_LINE (
3689 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3691 xmm_def = create_mask_2x32_128 (src, src);
3692 xmm_src = expand_pixel_32_1x128 (src);
3693 xmm_alpha = expand_alpha_1x128 (xmm_src);
3694 mmx_src = _mm_movepi64_pi64 (xmm_src);
3695 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3700 dst_line += dst_stride;
3702 mask_line += mask_stride;
3705 /* call prefetch hint to optimize cache load*/
3706 cache_prefetch ((__m128i*)mask);
3707 cache_prefetch ((__m128i*)dst);
3709 while (w && (unsigned long)dst & 15)
3711 uint8_t m = *mask++;
3716 mmx_mask = expand_pixel_8_1x64 (m);
3717 mmx_dest = unpack_32_1x64 (d);
3719 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3729 /* call prefetch hint to optimize cache load*/
3730 cache_prefetch ((__m128i*)mask);
3731 cache_prefetch ((__m128i*)dst);
3735 /* fill cache line with next memory */
3736 cache_prefetch_next ((__m128i*)mask);
3737 cache_prefetch_next ((__m128i*)dst);
3739 m = *((uint32_t*)mask);
3741 if (srca == 0xff && m == 0xffffffff)
3743 save_128_aligned ((__m128i*)dst, xmm_def);
3747 xmm_dst = load_128_aligned ((__m128i*) dst);
3748 xmm_mask = unpack_32_1x128 (m);
3749 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3752 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3753 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3755 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3756 &xmm_mask_lo, &xmm_mask_hi);
3758 in_over_2x128 (&xmm_src, &xmm_src,
3759 &xmm_alpha, &xmm_alpha,
3760 &xmm_mask_lo, &xmm_mask_hi,
3761 &xmm_dst_lo, &xmm_dst_hi);
3764 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3774 uint8_t m = *mask++;
3779 mmx_mask = expand_pixel_8_1x64 (m);
3780 mmx_dest = unpack_32_1x64 (d);
3782 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3796 /* ----------------------------------------------------------------
3797 * pixman_fill_sse2
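 * Fills a rectangle of 16 or 32 bpp pixels with a solid 32-bit value
 * using aligned 128-bit stores; also used by sse2_composite_src_n_8_8888 below.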
3801 pixman_fill_sse2 (uint32_t *bits,
3810 uint32_t byte_width;
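/* A 16 bpp fill is only handled when both 16-bit halves of the 32-bit
 * fill value contain the same pixel.
 */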
3815 if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3818 if (bpp != 16 && bpp != 32)
3823 stride = stride * (int) sizeof (uint32_t) / 2;
3824 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3825 byte_width = 2 * width;
3830 stride = stride * (int) sizeof (uint32_t) / 4;
3831 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3832 byte_width = 4 * width;
3836 cache_prefetch ((__m128i*)byte_line);
3837 xmm_def = create_mask_2x32_128 (data, data);
3842 uint8_t *d = byte_line;
3843 byte_line += stride;
3847 cache_prefetch_next ((__m128i*)d);
3849 while (w >= 2 && ((unsigned long)d & 3))
3851 *(uint16_t *)d = data;
3856 while (w >= 4 && ((unsigned long)d & 15))
3858 *(uint32_t *)d = data;
3864 cache_prefetch_next ((__m128i*)d);
3868 cache_prefetch (((__m128i*)d) + 12);
3870 save_128_aligned ((__m128i*)(d), xmm_def);
3871 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3872 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3873 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3874 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3875 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3876 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3877 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3885 cache_prefetch (((__m128i*)d) + 8);
3887 save_128_aligned ((__m128i*)(d), xmm_def);
3888 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3889 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3890 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3896 cache_prefetch_next ((__m128i*)d);
3900 save_128_aligned ((__m128i*)(d), xmm_def);
3901 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3909 save_128_aligned ((__m128i*)(d), xmm_def);
3915 cache_prefetch_next ((__m128i*)d);
3919 *(uint32_t *)d = data;
3927 *(uint16_t *)d = data;
3938 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3940 pixman_image_t * src_image,
3941 pixman_image_t * mask_image,
3942 pixman_image_t * dst_image,
3953 uint32_t *dst_line, *dst;
3954 uint8_t *mask_line, *mask;
3955 int dst_stride, mask_stride;
3959 __m128i xmm_src, xmm_def;
3960 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3962 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3967 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3968 PIXMAN_FORMAT_BPP (dst_image->bits.format),
3969 dest_x, dest_y, width, height, 0);
3973 PIXMAN_IMAGE_GET_LINE (
3974 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3975 PIXMAN_IMAGE_GET_LINE (
3976 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3978 xmm_def = create_mask_2x32_128 (src, src);
3979 xmm_src = expand_pixel_32_1x128 (src);
3984 dst_line += dst_stride;
3986 mask_line += mask_stride;
3989 /* call prefetch hint to optimize cache load*/
3990 cache_prefetch ((__m128i*)mask);
3991 cache_prefetch ((__m128i*)dst);
3993 while (w && (unsigned long)dst & 15)
3995 uint8_t m = *mask++;
3999 *dst = pack_1x64_32 (
4001 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4012 /* call prefetch hint to optimize cache load*/
4013 cache_prefetch ((__m128i*)mask);
4014 cache_prefetch ((__m128i*)dst);
4018 /* fill cache line with next memory */
4019 cache_prefetch_next ((__m128i*)mask);
4020 cache_prefetch_next ((__m128i*)dst);
4022 m = *((uint32_t*)mask);
4024 if (srca == 0xff && m == 0xffffffff)
4026 save_128_aligned ((__m128i*)dst, xmm_def);
4030 xmm_mask = unpack_32_1x128 (m);
4031 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4034 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4036 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4037 &xmm_mask_lo, &xmm_mask_hi);
4039 pix_multiply_2x128 (&xmm_src, &xmm_src,
4040 &xmm_mask_lo, &xmm_mask_hi,
4041 &xmm_mask_lo, &xmm_mask_hi);
4044 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4048 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4058 uint8_t m = *mask++;
4062 *dst = pack_1x64_32 (
4064 _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4079 /*-----------------------------------------------------------------------
4080 * composite_over_n_8_0565
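 * Solid (n) source composited OVER an r5g6b5 destination through an a8 mask.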
4084 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4086 pixman_image_t * src_image,
4087 pixman_image_t * mask_image,
4088 pixman_image_t * dst_image,
4099 uint16_t *dst_line, *dst, d;
4100 uint8_t *mask_line, *mask;
4101 int dst_stride, mask_stride;
4104 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4106 __m128i xmm_src, xmm_alpha;
4107 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4108 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4110 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4116 PIXMAN_IMAGE_GET_LINE (
4117 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4118 PIXMAN_IMAGE_GET_LINE (
4119 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4121 xmm_src = expand_pixel_32_1x128 (src);
4122 xmm_alpha = expand_alpha_1x128 (xmm_src);
4123 mmx_src = _mm_movepi64_pi64 (xmm_src);
4124 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4129 dst_line += dst_stride;
4131 mask_line += mask_stride;
4134 /* call prefetch hint to optimize cache load*/
4135 cache_prefetch ((__m128i*)mask);
4136 cache_prefetch ((__m128i*)dst);
4138 while (w && (unsigned long)dst & 15)
4145 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4146 mmx_dest = expand565_16_1x64 (d);
4148 *dst = pack_565_32_16 (
4151 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4158 /* call prefetch hint to optimize cache load*/
4159 cache_prefetch ((__m128i*)mask);
4160 cache_prefetch ((__m128i*)dst);
4164 /* fill cache line with next memory */
4165 cache_prefetch_next ((__m128i*)mask);
4166 cache_prefetch_next ((__m128i*)dst);
4168 xmm_dst = load_128_aligned ((__m128i*) dst);
4169 unpack_565_128_4x128 (xmm_dst,
4170 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4172 m = *((uint32_t*)mask);
4177 xmm_mask = unpack_32_1x128 (m);
4178 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4181 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4183 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4184 &xmm_mask_lo, &xmm_mask_hi);
4186 in_over_2x128 (&xmm_src, &xmm_src,
4187 &xmm_alpha, &xmm_alpha,
4188 &xmm_mask_lo, &xmm_mask_hi,
4189 &xmm_dst0, &xmm_dst1);
4192 m = *((uint32_t*)mask);
4197 xmm_mask = unpack_32_1x128 (m);
4198 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4201 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4203 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4204 &xmm_mask_lo, &xmm_mask_hi);
4205 in_over_2x128 (&xmm_src, &xmm_src,
4206 &xmm_alpha, &xmm_alpha,
4207 &xmm_mask_lo, &xmm_mask_hi,
4208 &xmm_dst2, &xmm_dst3);
4212 (__m128i*)dst, pack_565_4x128_128 (
4213 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4226 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4227 mmx_dest = expand565_16_1x64 (d);
4229 *dst = pack_565_32_16 (
4232 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4243 /* -----------------------------------------------------------------------
4244 * composite_over_pixbuf_0565
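 * Non-premultiplied, channel-swapped (pixbuf) source composited OVER
 * an r5g6b5 destination.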
4248 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4250 pixman_image_t * src_image,
4251 pixman_image_t * mask_image,
4252 pixman_image_t * dst_image,
4262 uint16_t *dst_line, *dst, d;
4263 uint32_t *src_line, *src, s;
4264 int dst_stride, src_stride;
4266 uint32_t opaque, zero;
4269 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4270 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4272 PIXMAN_IMAGE_GET_LINE (
4273 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4274 PIXMAN_IMAGE_GET_LINE (
4275 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4280 * This code was copied from the MMX version, FIXME and all.
4281 * If it's a problem there, it's probably a problem here too.
4283 assert (src_image->drawable == mask_image->drawable);
4289 dst_line += dst_stride;
4291 src_line += src_stride;
4294 /* call prefetch hint to optimize cache load*/
4295 cache_prefetch ((__m128i*)src);
4296 cache_prefetch ((__m128i*)dst);
4298 while (w && (unsigned long)dst & 15)
4303 ms = unpack_32_1x64 (s);
4305 *dst++ = pack_565_32_16 (
4307 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4311 /* call prefetch hint to optimize cache load*/
4312 cache_prefetch ((__m128i*)src);
4313 cache_prefetch ((__m128i*)dst);
4317 /* fill cache line with next memory */
4318 cache_prefetch_next ((__m128i*)src);
4319 cache_prefetch_next ((__m128i*)dst);
4322 xmm_src = load_128_unaligned ((__m128i*)src);
4323 xmm_dst = load_128_aligned ((__m128i*)dst);
4325 opaque = is_opaque (xmm_src);
4326 zero = is_zero (xmm_src);
4328 unpack_565_128_4x128 (xmm_dst,
4329 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4330 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4332 /* preload next round*/
4333 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4337 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4338 &xmm_dst0, &xmm_dst1);
4342 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4343 &xmm_dst0, &xmm_dst1);
4347 opaque = is_opaque (xmm_src);
4348 zero = is_zero (xmm_src);
4350 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4354 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4355 &xmm_dst2, &xmm_dst3);
4359 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4360 &xmm_dst2, &xmm_dst3);
4364 (__m128i*)dst, pack_565_4x128_128 (
4365 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4377 ms = unpack_32_1x64 (s);
4379 *dst++ = pack_565_32_16 (
4381 over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4389 /* -------------------------------------------------------------------------
4390 * composite_over_pixbuf_8888
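 * Non-premultiplied, channel-swapped (pixbuf) source composited OVER
 * an 8888 destination.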
4394 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4396 pixman_image_t * src_image,
4397 pixman_image_t * mask_image,
4398 pixman_image_t * dst_image,
4408 uint32_t *dst_line, *dst, d;
4409 uint32_t *src_line, *src, s;
4410 int dst_stride, src_stride;
4412 uint32_t opaque, zero;
4414 __m128i xmm_src_lo, xmm_src_hi;
4415 __m128i xmm_dst_lo, xmm_dst_hi;
4417 PIXMAN_IMAGE_GET_LINE (
4418 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4419 PIXMAN_IMAGE_GET_LINE (
4420 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4425 * This code was copied from the MMX version, FIXME and all.
4426 * If it's a problem there, it's probably a problem here too.
4428 assert (src_image->drawable == mask_image->drawable);
4434 dst_line += dst_stride;
4436 src_line += src_stride;
4439 /* call prefetch hint to optimize cache load*/
4440 cache_prefetch ((__m128i*)src);
4441 cache_prefetch ((__m128i*)dst);
4443 while (w && (unsigned long)dst & 15)
4448 *dst++ = pack_1x64_32 (
4449 over_rev_non_pre_1x64 (
4450 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4455 /* call prefetch hint to optimize cache load*/
4456 cache_prefetch ((__m128i*)src);
4457 cache_prefetch ((__m128i*)dst);
4461 /* fill cache line with next memory */
4462 cache_prefetch_next ((__m128i*)src);
4463 cache_prefetch_next ((__m128i*)dst);
4465 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4467 opaque = is_opaque (xmm_src_hi);
4468 zero = is_zero (xmm_src_hi);
4470 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4474 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4475 &xmm_dst_lo, &xmm_dst_hi);
4478 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4482 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4484 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4486 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4487 &xmm_dst_lo, &xmm_dst_hi);
4490 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4503 *dst++ = pack_1x64_32 (
4504 over_rev_non_pre_1x64 (
4505 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4514 /* -------------------------------------------------------------------------------------------------
4515 * composite_over_n_8888_0565_ca
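 * Solid (n) source composited OVER an r5g6b5 destination through an
 * a8r8g8b8 component-alpha mask.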
4519 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4521 pixman_image_t * src_image,
4522 pixman_image_t * mask_image,
4523 pixman_image_t * dst_image,
4534 uint16_t *dst_line, *dst, d;
4535 uint32_t *mask_line, *mask, m;
4536 int dst_stride, mask_stride;
4540 __m128i xmm_src, xmm_alpha;
4541 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4542 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4544 __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4546 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4551 PIXMAN_IMAGE_GET_LINE (
4552 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4553 PIXMAN_IMAGE_GET_LINE (
4554 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4556 xmm_src = expand_pixel_32_1x128 (src);
4557 xmm_alpha = expand_alpha_1x128 (xmm_src);
4558 mmx_src = _mm_movepi64_pi64 (xmm_src);
4559 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4566 mask_line += mask_stride;
4567 dst_line += dst_stride;
4569 /* call prefetch hint to optimize cache load*/
4570 cache_prefetch ((__m128i*)mask);
4571 cache_prefetch ((__m128i*)dst);
4573 while (w && ((unsigned long)dst & 15))
4575 m = *(uint32_t *) mask;
4580 mmx_mask = unpack_32_1x64 (m);
4581 mmx_dest = expand565_16_1x64 (d);
4583 *dst = pack_565_32_16 (
4586 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4594 /* call prefetch hint to optimize cache load*/
4595 cache_prefetch ((__m128i*)mask);
4596 cache_prefetch ((__m128i*)dst);
4600 /* fill cache line with next memory */
4601 cache_prefetch_next ((__m128i*)mask);
4602 cache_prefetch_next ((__m128i*)dst);
4605 xmm_mask = load_128_unaligned ((__m128i*)mask);
4606 xmm_dst = load_128_aligned ((__m128i*)dst);
4608 pack_cmp = _mm_movemask_epi8 (
4609 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4611 unpack_565_128_4x128 (xmm_dst,
4612 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4613 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4615 /* preload next round */
4616 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4618 /* composite only if at least one of these mask pixels is non-zero */
4619 if (pack_cmp != 0xffff)
4621 in_over_2x128 (&xmm_src, &xmm_src,
4622 &xmm_alpha, &xmm_alpha,
4623 &xmm_mask_lo, &xmm_mask_hi,
4624 &xmm_dst0, &xmm_dst1);
4628 pack_cmp = _mm_movemask_epi8 (
4629 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4631 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4633 if (pack_cmp != 0xffff)
4635 in_over_2x128 (&xmm_src, &xmm_src,
4636 &xmm_alpha, &xmm_alpha,
4637 &xmm_mask_lo, &xmm_mask_hi,
4638 &xmm_dst2, &xmm_dst3);
4642 (__m128i*)dst, pack_565_4x128_128 (
4643 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4652 m = *(uint32_t *) mask;
4657 mmx_mask = unpack_32_1x64 (m);
4658 mmx_dest = expand565_16_1x64 (d);
4660 *dst = pack_565_32_16 (
4663 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4675 /* -----------------------------------------------------------------------
4676 * composite_in_n_8_8
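 * The alpha of a solid (n) source, multiplied by an a8 mask, is
 * multiplied IN to an a8 destination.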
4680 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4682 pixman_image_t * src_image,
4683 pixman_image_t * mask_image,
4684 pixman_image_t * dst_image,
4694 uint8_t *dst_line, *dst;
4695 uint8_t *mask_line, *mask;
4696 int dst_stride, mask_stride;
4702 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4703 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4705 PIXMAN_IMAGE_GET_LINE (
4706 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4707 PIXMAN_IMAGE_GET_LINE (
4708 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4710 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4716 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4721 dst_line += dst_stride;
4723 mask_line += mask_stride;
4726 /* call prefetch hint to optimize cache load*/
4727 cache_prefetch ((__m128i*)mask);
4728 cache_prefetch ((__m128i*)dst);
4730 while (w && ((unsigned long)dst & 15))
4732 m = (uint32_t) *mask++;
4733 d = (uint32_t) *dst;
4735 *dst++ = (uint8_t) pack_1x64_32 (
4737 pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4738 unpack_32_1x64 (m)),
4739 unpack_32_1x64 (d)));
4743 /* call prefetch hint to optimize cache load*/
4744 cache_prefetch ((__m128i*)mask);
4745 cache_prefetch ((__m128i*)dst);
4749 /* fill cache line with next memory */
4750 cache_prefetch_next ((__m128i*)mask);
4751 cache_prefetch_next ((__m128i*)dst);
4753 xmm_mask = load_128_unaligned ((__m128i*)mask);
4754 xmm_dst = load_128_aligned ((__m128i*)dst);
4756 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4757 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4759 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4760 &xmm_mask_lo, &xmm_mask_hi,
4761 &xmm_mask_lo, &xmm_mask_hi);
4763 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4764 &xmm_dst_lo, &xmm_dst_hi,
4765 &xmm_dst_lo, &xmm_dst_hi);
4768 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4777 m = (uint32_t) *mask++;
4778 d = (uint32_t) *dst;
4780 *dst++ = (uint8_t) pack_1x64_32 (
4783 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4784 unpack_32_1x64 (d)));
4792 /* ---------------------------------------------------------------------------
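 * An a8 source multiplied IN to an a8 destination (sse2_composite_in_8_8).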
4797 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4799 pixman_image_t * src_image,
4800 pixman_image_t * mask_image,
4801 pixman_image_t * dst_image,
4811 uint8_t *dst_line, *dst;
4812 uint8_t *src_line, *src;
4813 int src_stride, dst_stride;
4817 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4818 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4820 PIXMAN_IMAGE_GET_LINE (
4821 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4822 PIXMAN_IMAGE_GET_LINE (
4823 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4828 dst_line += dst_stride;
4830 src_line += src_stride;
4833 /* call prefetch hint to optimize cache load*/
4834 cache_prefetch ((__m128i*)src);
4835 cache_prefetch ((__m128i*)dst);
4837 while (w && ((unsigned long)dst & 15))
4839 s = (uint32_t) *src++;
4840 d = (uint32_t) *dst;
4842 *dst++ = (uint8_t) pack_1x64_32 (
4844 unpack_32_1x64 (s), unpack_32_1x64 (d)));
4848 /* call prefetch hint to optimize cache load*/
4849 cache_prefetch ((__m128i*)src);
4850 cache_prefetch ((__m128i*)dst);
4854 /* fill cache line with next memory */
4855 cache_prefetch_next ((__m128i*)src);
4856 cache_prefetch_next ((__m128i*)dst);
4858 xmm_src = load_128_unaligned ((__m128i*)src);
4859 xmm_dst = load_128_aligned ((__m128i*)dst);
4861 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4862 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4864 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4865 &xmm_dst_lo, &xmm_dst_hi,
4866 &xmm_dst_lo, &xmm_dst_hi);
4869 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4878 s = (uint32_t) *src++;
4879 d = (uint32_t) *dst;
4881 *dst++ = (uint8_t) pack_1x64_32 (
4882 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
4890 /* -------------------------------------------------------------------------
4891 * composite_add_8888_8_8
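 * The alpha of a solid source, scaled by an a8 mask, is saturate-added
 * to an a8 destination.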
4895 sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
4897 pixman_image_t * src_image,
4898 pixman_image_t * mask_image,
4899 pixman_image_t * dst_image,
4909 uint8_t *dst_line, *dst;
4910 uint8_t *mask_line, *mask;
4911 int dst_stride, mask_stride;
4918 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4919 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4921 PIXMAN_IMAGE_GET_LINE (
4922 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4923 PIXMAN_IMAGE_GET_LINE (
4924 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4926 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4932 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4937 dst_line += dst_stride;
4939 mask_line += mask_stride;
4942 /* call prefetch hint to optimize cache load*/
4943 cache_prefetch ((__m128i*)mask);
4944 cache_prefetch ((__m128i*)dst);
4946 while (w && ((unsigned long)dst & 15))
4948 m = (uint32_t) *mask++;
4949 d = (uint32_t) *dst;
4951 *dst++ = (uint8_t) pack_1x64_32 (
4954 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4955 unpack_32_1x64 (d)));
4959 /* call prefetch hint to optimize cache load*/
4960 cache_prefetch ((__m128i*)mask);
4961 cache_prefetch ((__m128i*)dst);
4965 /* fill cache line with next memory */
4966 cache_prefetch_next ((__m128i*)mask);
4967 cache_prefetch_next ((__m128i*)dst);
4969 xmm_mask = load_128_unaligned ((__m128i*)mask);
4970 xmm_dst = load_128_aligned ((__m128i*)dst);
4972 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4973 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4975 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4976 &xmm_mask_lo, &xmm_mask_hi,
4977 &xmm_mask_lo, &xmm_mask_hi);
4979 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4980 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4983 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4992 m = (uint32_t) *mask++;
4993 d = (uint32_t) *dst;
4995 *dst++ = (uint8_t) pack_1x64_32 (
4998 _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4999 unpack_32_1x64 (d)));
5008 /* ----------------------------------------------------------------------
5009 * composite_add_8000_8000
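 * An a8 source saturate-added to an a8 destination.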
5013 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5015 pixman_image_t * src_image,
5016 pixman_image_t * mask_image,
5017 pixman_image_t * dst_image,
5027 uint8_t *dst_line, *dst;
5028 uint8_t *src_line, *src;
5029 int dst_stride, src_stride;
5033 PIXMAN_IMAGE_GET_LINE (
5034 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5035 PIXMAN_IMAGE_GET_LINE (
5036 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5043 /* call prefetch hint to optimize cache load*/
5044 cache_prefetch ((__m128i*)src);
5045 cache_prefetch ((__m128i*)dst);
5047 dst_line += dst_stride;
5048 src_line += src_stride;
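    /* Per-byte saturating add: when t exceeds 0xff, (t >> 8) is 1, so
     * 0 - (t >> 8) becomes an all-ones mask that clamps the stored
     * value to 0xff.
     */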
5052 while (w && (unsigned long)dst & 3)
5054 t = (*dst) + (*src++);
5055 *dst++ = t | (0 - (t >> 8));
5059 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5069 t = (*dst) + (*src++);
5070 *dst++ = t | (0 - (t >> 8));
5078 /* ---------------------------------------------------------------------
5079 * composite_add_8888_8888
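 * An 8888 source saturate-added to an 8888 destination; each scanline
 * is handled by core_combine_add_u_sse2.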
5082 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5084 pixman_image_t * src_image,
5085 pixman_image_t * mask_image,
5086 pixman_image_t * dst_image,
5096 uint32_t *dst_line, *dst;
5097 uint32_t *src_line, *src;
5098 int dst_stride, src_stride;
5100 PIXMAN_IMAGE_GET_LINE (
5101 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5102 PIXMAN_IMAGE_GET_LINE (
5103 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5108 dst_line += dst_stride;
5110 src_line += src_stride;
5112 core_combine_add_u_sse2 (dst, src, NULL, width);
5118 /* -------------------------------------------------------------------------------------------------
5119 * sse2_composite_copy_area
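 * A straight copy of the source rectangle to the destination,
 * implemented with pixman_blt_sse2 below.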
5122 static pixman_bool_t
5123 pixman_blt_sse2 (uint32_t *src_bits,
5136 uint8_t * src_bytes;
5137 uint8_t * dst_bytes;
5140 if (src_bpp != dst_bpp)
5145 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5146 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5147 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5148 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5149 byte_width = 2 * width;
5153 else if (src_bpp == 32)
5155 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5156 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5157 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5158 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5159 byte_width = 4 * width;
5168 cache_prefetch ((__m128i*)src_bytes);
5169 cache_prefetch ((__m128i*)dst_bytes);
5174 uint8_t *s = src_bytes;
5175 uint8_t *d = dst_bytes;
5176 src_bytes += src_stride;
5177 dst_bytes += dst_stride;
5180 cache_prefetch_next ((__m128i*)s);
5181 cache_prefetch_next ((__m128i*)d);
5183 while (w >= 2 && ((unsigned long)d & 3))
5185 *(uint16_t *)d = *(uint16_t *)s;
5191 while (w >= 4 && ((unsigned long)d & 15))
5193 *(uint32_t *)d = *(uint32_t *)s;
5200 cache_prefetch_next ((__m128i*)s);
5201 cache_prefetch_next ((__m128i*)d);
5205 __m128i xmm0, xmm1, xmm2, xmm3;
5207 /* 128 bytes ahead */
5208 cache_prefetch (((__m128i*)s) + 8);
5209 cache_prefetch (((__m128i*)d) + 8);
5211 xmm0 = load_128_unaligned ((__m128i*)(s));
5212 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5213 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5214 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5216 save_128_aligned ((__m128i*)(d), xmm0);
5217 save_128_aligned ((__m128i*)(d + 16), xmm1);
5218 save_128_aligned ((__m128i*)(d + 32), xmm2);
5219 save_128_aligned ((__m128i*)(d + 48), xmm3);
5226 cache_prefetch_next ((__m128i*)s);
5227 cache_prefetch_next ((__m128i*)d);
5231 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5238 cache_prefetch_next ((__m128i*)s);
5239 cache_prefetch_next ((__m128i*)d);
5243 *(uint32_t *)d = *(uint32_t *)s;
5252 *(uint16_t *)d = *(uint16_t *)s;
5265 sse2_composite_copy_area (pixman_implementation_t *imp,
5267 pixman_image_t * src_image,
5268 pixman_image_t * mask_image,
5269 pixman_image_t * dst_image,
5279 pixman_blt_sse2 (src_image->bits.bits,
5280 dst_image->bits.bits,
5281 src_image->bits.rowstride,
5282 dst_image->bits.rowstride,
5283 PIXMAN_FORMAT_BPP (src_image->bits.format),
5284 PIXMAN_FORMAT_BPP (dst_image->bits.format),
5285 src_x, src_y, dest_x, dest_y, width, height);
5289 /* This code is buggy in the MMX version; the bug has been carried over to this SSE2 version */
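/* An x888 source (alpha forced to 0xff) composited OVER an 8888
 * destination through an a8 mask.
 */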
5291 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5293 pixman_image_t * src_image,
5294 pixman_image_t * mask_image,
5295 pixman_image_t * dst_image,
5305 uint32_t *src, *src_line, s;
5306 uint32_t *dst, *dst_line, d;
5307 uint8_t *mask, *mask_line;
5309 int src_stride, mask_stride, dst_stride;
5312 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5313 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5314 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5316 PIXMAN_IMAGE_GET_LINE (
5317 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5318 PIXMAN_IMAGE_GET_LINE (
5319 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5320 PIXMAN_IMAGE_GET_LINE (
5321 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5326 src_line += src_stride;
5328 dst_line += dst_stride;
5330 mask_line += mask_stride;
5334 /* call prefetch hint to optimize cache load*/
5335 cache_prefetch ((__m128i*)src);
5336 cache_prefetch ((__m128i*)dst);
5337 cache_prefetch ((__m128i*)mask);
5339 while (w && (unsigned long)dst & 15)
5341 s = 0xff000000 | *src++;
5342 m = (uint32_t) *mask++;
5345 __m64 ms = unpack_32_1x64 (s);
5349 ms = in_over_1x64 (ms,
5351 expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
5352 unpack_32_1x64 (d));
5355 *dst++ = pack_1x64_32 (ms);
5359 /* call prefetch hint to optimize cache load*/
5360 cache_prefetch ((__m128i*)src);
5361 cache_prefetch ((__m128i*)dst);
5362 cache_prefetch ((__m128i*)mask);
5366 /* fill cache line with next memory */
5367 cache_prefetch_next ((__m128i*)src);
5368 cache_prefetch_next ((__m128i*)dst);
5369 cache_prefetch_next ((__m128i*)mask);
5371 m = *(uint32_t*) mask;
5372 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5374 if (m == 0xffffffff)
5376 save_128_aligned ((__m128i*)dst, xmm_src);
5380 xmm_dst = load_128_aligned ((__m128i*)dst);
5382 xmm_mask = _mm_unpacklo_epi16 (
5383 unpack_32_1x128 (m), _mm_setzero_si128 ());
5385 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5386 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5387 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5389 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
5390 &xmm_mask_lo, &xmm_mask_hi);
5392 in_over_2x128 (xmm_src_lo, xmm_src_hi,
5393 mask_00ff, mask_00ff,
5394 xmm_mask_lo, xmm_mask_hi,
5395 &xmm_dst_lo, &xmm_dst_hi);
5398 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5409 m = (uint32_t) *mask++;
5413 s = 0xff000000 | *src;
5423 *dst = pack_1x64_32 (
5427 expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
5428 unpack_32_1x64 (d)));
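/* Each fast path entry lists: operator, source format, mask format,
 * destination format, the composite function that implements the
 * combination, and flags.
 */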
5444 static const pixman_fast_path_t sse2_fast_paths[] =
5446 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 },
5447 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 },
5448 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 },
5449 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 },
5450 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 },
5451 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
5452 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
5453 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
5454 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
5455 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
5456 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
5457 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5458 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 },
5459 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5460 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 },
5462 /* FIXME: This code is buggy in the MMX version; the bug has been carried over to this SSE2 version */
5463 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5464 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5465 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
5466 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
5468 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5469 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5470 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5471 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
5472 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5473 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5474 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5475 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
5476 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5477 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5478 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5479 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5480 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5481 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5482 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5483 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5484 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5485 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5486 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5487 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5488 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5489 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
5490 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5491 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5492 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5493 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
5494 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5495 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5497 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 },
5498 { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 },
5499 { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 },
5500 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_8888_8_8, 0 },
5502 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5503 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 },
5504 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5505 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 },
5506 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 },
5507 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 },
5508 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5509 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5510 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
5511 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
5512 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 },
5513 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 },
5515 { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 },
5516 { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 },
5522 * Work around GCC bug causing crashes in Mozilla with SSE2
5524 * When using -msse, gcc generates movdqa instructions assuming that
5525 * the stack is 16 byte aligned. Unfortunately some applications, such
5526 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
5527 * causes the movdqa instructions to fail.
5529 * The __force_align_arg_pointer__ makes gcc generate a prologue that
5530 * realigns the stack pointer to 16 bytes.
5532 * On x86-64 this is not necessary because the standard ABI already
5533 * calls for a 16 byte aligned stack.
5535 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
5537 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5538 __attribute__((__force_align_arg_pointer__))
5541 sse2_composite (pixman_implementation_t *imp,
5543 pixman_image_t * src,
5544 pixman_image_t * mask,
5545 pixman_image_t * dest,
5555 if (_pixman_run_fast_path (sse2_fast_paths, imp,
5556 op, src, mask, dest,
5565 _pixman_implementation_composite (imp->delegate, op,
5573 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5574 __attribute__((__force_align_arg_pointer__))
5576 static pixman_bool_t
5577 sse2_blt (pixman_implementation_t *imp,
5578 uint32_t * src_bits,
5579 uint32_t * dst_bits,
5591 if (!pixman_blt_sse2 (
5592 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5593 src_x, src_y, dst_x, dst_y, width, height))
5596 return _pixman_implementation_blt (
5598 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5599 src_x, src_y, dst_x, dst_y, width, height);
5605 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5606 __attribute__((__force_align_arg_pointer__))
5608 static pixman_bool_t
5609 sse2_fill (pixman_implementation_t *imp,
5619 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5621 return _pixman_implementation_fill (
5622 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5628 pixman_implementation_t *
5629 _pixman_implementation_create_sse2 (void)
5631 pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5632 pixman_implementation_t *imp = _pixman_implementation_create (mmx);
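    /* The SSE2 implementation is created on top of the MMX implementation:
     * any operation not overridden below falls through to imp->delegate.
     */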
5634 /* SSE2 constants */
5635 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5636 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5637 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5638 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5639 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5640 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5641 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5642 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5643 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
5644 mask_0080 = create_mask_16_128 (0x0080);
5645 mask_00ff = create_mask_16_128 (0x00ff);
5646 mask_0101 = create_mask_16_128 (0x0101);
5647 mask_ffff = create_mask_16_128 (0xffff);
5648 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5649 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5652 mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5653 mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5655 mask_x0080 = create_mask_16_64 (0x0080);
5656 mask_x00ff = create_mask_16_64 (0x00ff);
5657 mask_x0101 = create_mask_16_64 (0x0101);
5658 mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5662 /* Set up function pointers */
5664 /* SSE2 code patch for fbcompose.c */
5665 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5666 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5667 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5668 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5669 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5670 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5671 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5672 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5673 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5674 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5676 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5678 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5679 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5680 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5681 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5682 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5683 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5684 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5685 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5686 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5687 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5688 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5690 imp->composite = sse2_composite;
5691 imp->blt = sse2_blt;
5692 imp->fill = sse2_fill;
5697 #endif /* USE_SSE2 */