/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */

#include "pixman-private.h"
#include "pixman-combine32.h"
/* --------------------------------------------------------------------
 * Locals
 */

static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_x_alpha;

static __m64 mask_x565_rgb;
static __m64 mask_x565_unpack;

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;
/* ----------------------------------------------------------------------
 * SSE2 Inlines
 */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}
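
/* Expand four packed 565 pixels (one per 32-bit lane) to 8888: each
 * field is shifted to the top of its 8-bit slot and its high bits are
 * then replicated into the vacated low bits, so that 0x1f widens to
 * 0xff instead of the 0xf8 a plain shift would give.
 */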
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}
static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}
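
/* Pack back to 565: keep the top 5 bits of red, 6 of green and 5 of
 * blue from the packed 8888 data.  Green straddles a byte boundary in
 * 565, which is why it is assembled from the two parts g1/g2 below.
 */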
static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}
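
/* _mm_movemask_epi8 collects the top bit of each of the 16 bytes, so
 * for four ARGB pixels the alpha bytes land at mask bits 3, 7, 11 and
 * 15 - hence the 0x8888 pattern used by is_opaque/is_transparent.
 */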
static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
						     _MM_SHUFFLE (3, 3, 3, 3)),
				_MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}
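
/* Per-channel multiply with a correctly rounded division by 255:
 *
 *     t = x * a + 0x80;   result = (t + (t >> 8)) >> 8;
 *
 * The version below computes the same thing by adding 0x0080 and
 * taking the high half of a 16-bit multiply by 0x0101, since
 * (t * 0x0101) >> 16 == (t + (t >> 8)) >> 8 for 16-bit t.
 */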
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
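
/* Computes src * alpha_dst + dst * alpha_src with a single shared
 * 0x80 rounding bias and one division by 255.  The saturating adds
 * clamp the 16-bit sum, and this kernel is what the ATOP and XOR
 * combiners below are built on.
 */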
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i lo, hi;
    __m128i mul_lo, mul_hi;

    lo = _mm_mullo_epi16 (*src_lo, *alpha_dst_lo);
    hi = _mm_mullo_epi16 (*src_hi, *alpha_dst_hi);
    mul_lo = _mm_mullo_epi16 (*dst_lo, *alpha_src_lo);
    mul_hi = _mm_mullo_epi16 (*dst_hi, *alpha_src_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    lo = _mm_adds_epu16 (lo, mul_lo);
    hi = _mm_adds_epu16 (hi, mul_hi);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}
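
/* Porter-Duff OVER for premultiplied pixels:
 *
 *     dst = src + (1 - alpha_src) * dst
 *
 * negate_2x128 gives 255 - alpha, pix_multiply_2x128 divides by 255,
 * and the final add saturates so rounding can never wrap a channel.
 */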
static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}
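
/* in_over is the fused form used by the masked fast paths:
 *
 *     dst = (src IN mask) OVER dst
 *
 * i.e. both the source and its alpha are first multiplied by the
 * mask channels before the OVER is applied.
 */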
static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

static force_inline void
cache_prefetch (__m128i* addr)
{
    _mm_prefetch (addr, _MM_HINT_T0);
}

static force_inline void
cache_prefetch_next (__m128i* addr)
{
    _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
}

/* load 4 pixels from a 16-byte aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}
/* ------------------------------------------------------------------
 * MMX inlines
 */

static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
}

static force_inline __m64
expand_alpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
expand_pixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (
	unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
pix_multiply_1x64 (__m64 data,
                   __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
					  mask_x0080),
			   mask_x0101);
}

static force_inline __m64
pix_add_multiply_1x64 (__m64* src,
                       __m64* alpha_dst,
                       __m64* dst,
                       __m64* alpha_src)
{
    return _mm_mulhi_pu16 (
	_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst),
				      mask_x0080),
		       _mm_mullo_pi16 (*dst, *alpha_src)),
	mask_x0101);
}

static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, mask_x00ff);
}

static force_inline __m64
invert_colors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
}

static force_inline __m64
in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pix_multiply_1x64 (*src, *mask),
		      pix_multiply_1x64 (*alpha, *mask),
		      *dst);
}

static force_inline __m64
over_rev_non_pre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expand_alpha_1x64 (src);

    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
					 _mm_or_si64 (alpha, mask_x_alpha)),
		      alpha,
		      dst);
}

static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of an MMX register into
 * 00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, mask_x565_rgb);
    p = _mm_mullo_pi16 (p, mask_x565_unpack);

    return _mm_srli_pi16 (p, 8);
}
/* ----------------------------------------------------------------------------
 * Compose Core transformations
 */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    __m64 ms;

    ms = unpack_32_1x64 (src);
    return pack_1x64_32 (
	over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
	__m64 ms, mm;

	mm = unpack_32_1x64 (*pm);
	mm = expand_alpha_1x64 (mm);

	ms = unpack_32_1x64 (s);
	ms = pix_multiply_1x64 (ms, mm);

	s = pack_1x64_32 (ms);
    }

    return s;
}
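
/* combine4 is the 4-pixel-wide counterpart of combine1: it returns
 * the source multiplied by the mask's alpha channel (or the source
 * untouched when pm is NULL), with an early out when all four mask
 * pixels are fully transparent.
 */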
static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
	xmm_msk_lo = load_128_unaligned (pm);

	if (is_transparent (xmm_msk_lo))
	    return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_msk_lo, &xmm_msk_hi,
			    &xmm_src_lo, &xmm_src_hi);

	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
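
/* All of the core_combine_*_sse2 functions below share one shape: a
 * scalar head loop runs until pd reaches a 16-byte boundary, a vector
 * body then handles four pixels per iteration, and a scalar tail
 * finishes the remainder.  Only the destination is accessed aligned;
 * source and mask are read with unaligned loads.
 */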
static force_inline void
core_combine_over_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
	ps++;
	if (pm)
	    pm++;
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	/* I'm loading unaligned because I'm not sure about
	 * the address alignment.
	 */
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);

	if (is_opaque (xmm_src_hi))
	{
	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
	}
	else if (!is_zero (xmm_src_hi))
	{
	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	    expand_alpha_2x128 (
		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst_lo, &xmm_dst_hi);

	    /* rebuild the 4 pixel data and save */
	    save_128_aligned ((__m128i*)pd,
			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}

	w -= 4;
	ps += 4;
	pd += 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
	ps++;
	if (pm)
	    pm++;
	w--;
    }
}
static force_inline void
core_combine_over_reverse_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	/* I'm loading unaligned because I'm not sure
	 * about the address alignment.
	 */
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_src_lo, &xmm_src_hi);

	/* rebuild the 4 pixel data and save */
	save_128_aligned ((__m128i*)pd,
			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));

	w -= 4;
	ps += 4;
	pd += 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
	ps++;
	w--;
	if (pm)
	    pm++;
    }
}
static force_inline uint32_t
core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
	return 0;
    }
    else if (maska != 0xff)
    {
	return pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (dst),
			       expand_alpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}
static force_inline void
core_combine_in_u_sse2 (uint32_t*       pd,
                        const uint32_t* ps,
                        const uint32_t* pm,
                        int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixelsse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned ((__m128i*)pd,
			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixelsse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
static force_inline void
core_combine_reverse_in_u_sse2 (uint32_t*       pd,
                                const uint32_t* ps,
                                const uint32_t* pm,
                                int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixelsse2 (s, d);
	ps++;
	w--;
	if (pm)
	    pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixelsse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
static force_inline void
core_combine_reverse_out_u_sse2 (uint32_t*       pd,
                                 const uint32_t* ps,
                                 const uint32_t* pm,
                                 int             w)
{
    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d), negate_1x64 (
		    expand_alpha_1x64 (unpack_32_1x64 (s)))));

	if (pm)
	    pm++;
	ps++;
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	__m128i xmm_src_lo, xmm_src_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	if (pm)
	    pm += 4;
	w -= 4;
    }

    while (w)
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d), negate_1x64 (
		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
	ps++;
	if (pm)
	    pm++;
	w--;
    }
}
static force_inline void
core_combine_out_u_sse2 (uint32_t*       pd,
                         const uint32_t* ps,
                         const uint32_t* pm,
                         int             w)
{
    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (s), negate_1x64 (
		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	__m128i xmm_src_lo, xmm_src_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (s), negate_1x64 (
		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
    __m64 da = expand_alpha_1x64 (d);

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
static force_inline void
core_combine_atop_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
static force_inline void
core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
	ps++;
	w--;
	if (pm)
	    pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
	ps++;
	w--;
	if (pm)
	    pm++;
    }
}
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
}
static force_inline void
core_combine_xor_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t *mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
	xmm_dst = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
static force_inline void
core_combine_add_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t* mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = combine1 (ps, pm);
	d = *pd;

	ps++;
	if (pm)
	    pm++;
	w--;

	*pd++ = _mm_cvtsi64_si32 (
	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	__m128i s;

	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	s = combine4 ((__m128i*)ps, (__m128i*)pm);

	save_128_aligned (
	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

	pd += 4;
	ps += 4;
	if (pm)
	    pm += 4;
	w -= 4;
    }

    while (w--)
    {
	s = combine1 (ps, pm);
	d = *pd;

	ps++;
	*pd++ = _mm_cvtsi64_si32 (
	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
	if (pm)
	    pm++;
    }
}
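
/* SATURATE behaves like ADD unless a source alpha exceeds the room
 * left in the destination (~dst >> 24); in that case the source pixel
 * is first scaled by DIV_UN8 (da, sa) so the sum lands exactly at
 * full coverage instead of clamping channel by channel.
 */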
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
	ms = pix_multiply_1x64 (
	    ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}
static force_inline void
core_combine_saturate_u_sse2 (uint32_t *      pd,
                              const uint32_t *ps,
                              const uint32_t *pm,
                              int             w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst = load_128_aligned ((__m128i*)pd);
	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

	pack_cmp = _mm_movemask_epi8 (
	    _mm_cmpgt_epi32 (
		_mm_srli_epi32 (xmm_src, 24),
		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

	/* if some source alpha is greater than the respective ~alpha dst */
	if (pack_cmp)
	{
	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;

	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;

	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;

	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;
	}
	else
	{
	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

	    pd += 4;
	    ps += 4;
	    if (pm)
		pm += 4;
	}

	w -= 4;
    }

    while (w--)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	ps++;
	if (pm)
	    pm++;
    }
}
static force_inline void
core_combine_src_ca_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
	w--;
    }
}
static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expand_alpha_1x64 (s);
    __m64 unpk_mask = unpack_32_1x64 (mask);
    __m64 unpk_dst = unpack_32_1x64 (dst);

    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}
static force_inline void
core_combine_over_ca_sse2 (uint32_t*       pd,
                           const uint32_t* ps,
                           const uint32_t *pm,
                           int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
		       &xmm_alpha_lo, &xmm_alpha_hi,
		       &xmm_mask_lo, &xmm_mask_hi,
		       &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (
	over_1x64 (d, expand_alpha_1x64 (d),
		   pix_multiply_1x64 (unpack_32_1x64 (src),
				      unpack_32_1x64 (mask))));
}
static force_inline void
core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
                                   const uint32_t* ps,
                                   const uint32_t *pm,
                                   int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_mask_lo, &xmm_mask_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
static force_inline void
core_combine_in_ca_sse2 (uint32_t *      pd,
                         const uint32_t *ps,
                         const uint32_t *pm,
                         int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
		expand_alpha_1x64 (unpack_32_1x64 (d))));

	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		pix_multiply_1x64 (
		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
		expand_alpha_1x64 (unpack_32_1x64 (d))));

	w--;
    }
}
static force_inline void
core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
                                 const uint32_t *ps,
                                 const uint32_t *pm,
                                 int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d),
		pix_multiply_1x64 (unpack_32_1x64 (m),
				   expand_alpha_1x64 (unpack_32_1x64 (s)))));
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d),
		pix_multiply_1x64 (unpack_32_1x64 (m),
				   expand_alpha_1x64 (unpack_32_1x64 (s)))));
	w--;
    }
}
static force_inline void
core_combine_out_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		pix_multiply_1x64 (
		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
		negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);
	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
		      &xmm_alpha_lo, &xmm_alpha_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		pix_multiply_1x64 (
		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
		negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
	w--;
    }
}
static force_inline void
core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
                                  const uint32_t *ps,
                                  const uint32_t *pm,
                                  int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d),
		negate_1x64 (pix_multiply_1x64 (
				 unpack_32_1x64 (m),
				 expand_alpha_1x64 (unpack_32_1x64 (s))))));
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
		      &xmm_mask_lo, &xmm_mask_hi);

	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d),
		negate_1x64 (pix_multiply_1x64 (
				 unpack_32_1x64 (m),
				 expand_alpha_1x64 (unpack_32_1x64 (s))))));
	w--;
    }
}
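
/* Component-alpha ATOP:
 *
 *     dst = (src IN mask) * alpha_dst + dst * (1 - mask * alpha_src)
 *
 * The negated mask-times-source-alpha term below plays the role of
 * (1 - alpha) in the pix_add_multiply kernel.
 */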
static force_inline uint32_t
core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);
    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = expand_alpha_1x64 (d);

    s = pix_multiply_1x64 (s, m);
    m = negate_1x64 (pix_multiply_1x64 (m, sa));

    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
}
static force_inline void
core_combine_atop_ca_sse2 (uint32_t *      pd,
                           const uint32_t *ps,
                           const uint32_t *pm,
                           int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	pix_add_multiply_2x128 (
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
static force_inline uint32_t
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
    __m64 sa = expand_alpha_1x64 (s);

    s = pix_multiply_1x64 (s, m);
    m = pix_multiply_1x64 (m, sa);

    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
}
static force_inline void
core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
                                   const uint32_t *ps,
                                   const uint32_t *pm,
                                   int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_add_multiply_2x128 (
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
static force_inline uint32_t
core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                uint32_t mask,
                                uint32_t dst)
{
    __m64 a = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
				       a, expand_alpha_1x64 (s)));
    __m64 dest = pix_multiply_1x64 (s, a);
    __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&d,
                                                &alpha_dst,
                                                &dest,
                                                &alpha_src));
}
static force_inline void
core_combine_xor_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
		      &xmm_mask_lo, &xmm_mask_hi);

	pix_add_multiply_2x128 (
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
static force_inline void
core_combine_add_ca_sse2 (uint32_t *      pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
					     unpack_32_1x64 (m)),
			  unpack_32_1x64 (d)));
	w--;
    }

    /* call prefetch hint to optimize cache load */
    cache_prefetch ((__m128i*)ps);
    cache_prefetch ((__m128i*)pd);
    cache_prefetch ((__m128i*)pm);

    while (w >= 4)
    {
	/* fill cache line with next memory */
	cache_prefetch_next ((__m128i*)ps);
	cache_prefetch_next ((__m128i*)pd);
	cache_prefetch_next ((__m128i*)pm);

	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (
		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
					     unpack_32_1x64 (m)),
			  unpack_32_1x64 (d)));
	w--;
    }
}
/* ---------------------------------------------------
 * fb_compose_setup_SSE2
 */
static force_inline __m64
create_mask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static force_inline __m128i
create_mask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static force_inline __m64
create_mask_2x32_64 (uint32_t mask0,
                     uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
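
/* Note the argument order: _mm_set_epi32 takes the most significant
 * element first, so create_mask_2x32_128 (m0, m1) yields the 64-bit
 * pattern m0:m1 repeated in both halves of the register.
 */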
/* SSE2 code patch for fbcompose.c */

static void
sse2_combine_over_u (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_over_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                             uint32_t *dst, const uint32_t *src,
                             const uint32_t *mask, int width)
{
    core_combine_over_reverse_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_u (pixman_implementation_t *imp, pixman_op_t op,
                   uint32_t *dst, const uint32_t *src,
                   const uint32_t *mask, int width)
{
    core_combine_in_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                           uint32_t *dst, const uint32_t *src,
                           const uint32_t *mask, int width)
{
    core_combine_reverse_in_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_u (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dst, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    core_combine_out_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                            uint32_t *dst, const uint32_t *src,
                            const uint32_t *mask, int width)
{
    core_combine_reverse_out_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_atop_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
                             uint32_t *dst, const uint32_t *src,
                             const uint32_t *mask, int width)
{
    core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dst, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    core_combine_xor_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_add_u (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dst, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    core_combine_add_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src,
                         const uint32_t *mask, int width)
{
    core_combine_saturate_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_src_ca (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_src_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp, pixman_op_t op,
                      uint32_t *dst, const uint32_t *src,
                      const uint32_t *mask, int width)
{
    core_combine_over_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                              uint32_t *dst, const uint32_t *src,
                              const uint32_t *mask, int width)
{
    core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp, pixman_op_t op,
                    uint32_t *dst, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    core_combine_in_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                            uint32_t *dst, const uint32_t *src,
                            const uint32_t *mask, int width)
{
    core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_ca (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_out_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                             uint32_t *dst, const uint32_t *src,
                             const uint32_t *mask, int width)
{
    core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_ca (pixman_implementation_t *imp, pixman_op_t op,
                      uint32_t *dst, const uint32_t *src,
                      const uint32_t *mask, int width)
{
    core_combine_atop_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
                              uint32_t *dst, const uint32_t *src,
                              const uint32_t *mask, int width)
{
    core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_xor_ca (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_xor_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2_combine_add_ca (pixman_implementation_t *imp, pixman_op_t op,
                     uint32_t *dst, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    core_combine_add_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
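
/*
 * The thin wrappers above adapt the core_combine_*_sse2 () helpers to
 * pixman's generic combiner signature so they can be slotted into
 * imp->combine_32[] and imp->combine_32_ca[] in
 * _pixman_implementation_create_sse2 () below.  The trailing _mm_empty ()
 * matters because the core helpers fall back to MMX registers for
 * unaligned edge pixels, and MMX state must be cleared before any x87
 * floating-point code runs.
 */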
/* -------------------------------------------------------------------
 * composite_over_n_8888
 */
static void
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
                            pixman_op_t op,
                            pixman_image_t *src_image,
                            pixman_image_t *mask_image,
                            pixman_image_t *dst_image,
                            int32_t src_x, int32_t src_y,
                            int32_t mask_x, int32_t mask_y,
                            int32_t dest_x, int32_t dest_y,
                            int32_t width, int32_t height)
{
    uint32_t src;
    uint32_t *dst_line, *dst, d;
    int32_t w;
    int dst_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
        dst = dst_line;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)dst);

        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
                                              _mm_movepi64_pi64 (xmm_alpha),
                                              unpack_32_1x64 (d)));
            w--;
        }

        cache_prefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)dst);

            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixels and save */
            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
                                              _mm_movepi64_pi64 (xmm_alpha),
                                              unpack_32_1x64 (d)));
            w--;
        }
    }

    _mm_empty ();
}
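
/*
 * For reference: per channel, the OVER operator on premultiplied pixels
 * computes roughly
 *
 *     dest = src + mul_un8 (dest, 255 - alpha (src))
 *
 * where mul_un8 (a, b) stands for the usual rounded 8-bit multiply,
 * t = a * b + 128, result = (t + (t >> 8)) >> 8.  over_1x64 and
 * over_2x128 evaluate this for one and four unpacked pixels at a time.
 */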
/* ---------------------------------------------------------------------
 * composite_over_n_0565
 */
static void
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
                            pixman_op_t op,
                            pixman_image_t *src_image,
                            pixman_image_t *mask_image,
                            pixman_image_t *dst_image,
                            int32_t src_x, int32_t src_y,
                            int32_t mask_x, int32_t mask_y,
                            int32_t dest_x, int32_t dest_y,
                            int32_t width, int32_t height)
{
    uint32_t src;
    uint16_t *dst_line, *dst, d;
    int32_t w;
    int dst_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
        dst = dst_line;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)dst);

        dst_line += dst_stride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;

            *dst++ = pack_565_32_16 (
                pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
                                         _mm_movepi64_pi64 (xmm_alpha),
                                         expand565_16_1x64 (d))));
            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)dst);

        while (w >= 8)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)dst);

            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst0, &xmm_dst1);
            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst2, &xmm_dst3);

            xmm_dst = pack_565_4x128_128 (
                &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            save_128_aligned ((__m128i*)dst, xmm_dst);

            dst += 8;
            w -= 8;
        }

        while (w--)
        {
            d = *dst;
            *dst++ = pack_565_32_16 (
                pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
                                         _mm_movepi64_pi64 (xmm_alpha),
                                         expand565_16_1x64 (d))));
        }
    }

    _mm_empty ();
}
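
/*
 * For reference: expand565_16_1x64 () widens r5g6b5 to 8 bits per channel
 * by shifting each field into place and replicating its top bits into the
 * freed low bits (5-bit red r4..r0 becomes r4..r0 r4 r3 r2), which maps
 * 0x1f to 0xff and 0 to 0 without a divide; pack_565_32_16 () simply
 * truncates the channels back down.
 */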
/* ---------------------------------------------------------------------
 * composite_over_n_8888_8888_ca
 */
static void
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                    pixman_op_t op,
                                    pixman_image_t *src_image,
                                    pixman_image_t *mask_image,
                                    pixman_image_t *dst_image,
                                    int32_t src_x, int32_t src_y,
                                    int32_t mask_x, int32_t mask_y,
                                    int32_t dest_x, int32_t dest_y,
                                    int32_t width, int32_t height)
{
    uint32_t src;
    uint32_t *dst_line, d;
    uint32_t *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = _mm_movepi64_pi64 (xmm_src);
    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);

    while (height--)
    {
        int w = width;
        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)pd);
        cache_prefetch ((__m128i*)pm);

        while (w && (unsigned long)pd & 15)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;
                mmx_mask = unpack_32_1x64 (m);
                mmx_dest = unpack_32_1x64 (d);

                *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
                                                  &mmx_alpha,
                                                  &mmx_mask,
                                                  &mmx_dest));
            }

            pd++;
            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)pd);
        cache_prefetch ((__m128i*)pm);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)pd);
            cache_prefetch_next ((__m128i*)pm);

            xmm_mask = load_128_unaligned ((__m128i*)pm);

            pack_cmp = _mm_movemask_epi8 (
                _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
            if (pack_cmp != 0xffff)
            {
                xmm_dst = load_128_aligned ((__m128i*)pd);

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            pd += 4;
            pm += 4;
            w -= 4;
        }

        while (w)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;
                mmx_mask = unpack_32_1x64 (m);
                mmx_dest = unpack_32_1x64 (d);

                *pd = pack_1x64_32 (
                    in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
            }

            pd++;
            w--;
        }
    }

    _mm_empty ();
}
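
/*
 * How the pack_cmp test above works: _mm_cmpeq_epi32 () sets a 32-bit
 * lane to all ones where the mask pixel is zero, and _mm_movemask_epi8 ()
 * gathers the top bit of each of the 16 bytes, so a result of 0xffff
 * means "all four mask pixels are zero".  That lets the loop skip the
 * load/blend/store entirely for fully transparent spans, which is a
 * common case with glyph masks.
 */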
/* ---------------------------------------------------------------------
 * composite_over_8888_n_8888
 */
static void
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                 pixman_op_t op,
                                 pixman_image_t *src_image,
                                 pixman_image_t *mask_image,
                                 pixman_image_t *dst_image,
                                 int32_t src_x, int32_t src_y,
                                 int32_t mask_x, int32_t mask_y,
                                 int32_t dest_x, int32_t dest_y,
                                 int32_t width, int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    int32_t w;
    int dst_stride, src_stride;

    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);

    xmm_mask = create_mask_16_128 (mask >> 24);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)dst);
        cache_prefetch ((__m128i*)src);

        while (w && (unsigned long)dst & 15)
        {
            uint32_t s = *src++;
            uint32_t d = *dst;

            __m64 ms = unpack_32_1x64 (s);
            __m64 alpha = expand_alpha_1x64 (ms);
            __m64 dest = _mm_movepi64_pi64 (xmm_mask);
            __m64 alpha_dst = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (
                in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));

            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)dst);
        cache_prefetch ((__m128i*)src);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)dst);
            cache_prefetch_next ((__m128i*)src);

            xmm_src = load_128_unaligned ((__m128i*)src);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                           &xmm_alpha_lo, &xmm_alpha_hi,
                           &xmm_mask, &xmm_mask,
                           &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t s = *src++;
            uint32_t d = *dst;

            __m64 ms = unpack_32_1x64 (s);
            __m64 alpha = expand_alpha_1x64 (ms);
            __m64 mask = _mm_movepi64_pi64 (xmm_mask);
            __m64 dest = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (
                in_over_1x64 (&ms, &alpha, &mask, &dest));

            w--;
        }
    }

    _mm_empty ();
}
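
/*
 * For reference: in_over (src, srca, mask, dest) is the fused kernel used
 * by most mask-based paths in this file.  Per channel it is roughly
 *
 *     in_over = over (in (src, mask), in (srca, mask), dest)
 *             = src * mask + dest * (255 - srca * mask)
 *
 * (with rounded 8-bit multiplies); a solid mask simply replicates one
 * alpha value across all channels, as create_mask_16_128 () does above.
 */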
/* ---------------------------------------------------------------------
 * composite_over_x888_n_8888
 */
static void
sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                 pixman_op_t op,
                                 pixman_image_t *src_image,
                                 pixman_image_t *mask_image,
                                 pixman_image_t *dst_image,
                                 int32_t src_x, int32_t src_y,
                                 int32_t mask_x, int32_t mask_y,
                                 int32_t dest_x, int32_t dest_y,
                                 int32_t width, int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    int32_t w;

    __m128i xmm_mask, xmm_alpha;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);

    xmm_mask = create_mask_16_128 (mask >> 24);
    xmm_alpha = mask_00ff;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)dst);
        cache_prefetch ((__m128i*)src);

        while (w && (unsigned long)dst & 15)
        {
            uint32_t s = (*src++) | 0xff000000;
            uint32_t d = *dst;

            __m64 src = unpack_32_1x64 (s);
            __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
            __m64 mask = _mm_movepi64_pi64 (xmm_mask);
            __m64 dest = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (
                in_over_1x64 (&src, &alpha, &mask, &dest));

            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)dst);
        cache_prefetch ((__m128i*)src);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)dst);
            cache_prefetch_next ((__m128i*)src);

            xmm_src = _mm_or_si128 (
                load_128_unaligned ((__m128i*)src), mask_ff000000);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                           &xmm_alpha, &xmm_alpha,
                           &xmm_mask, &xmm_mask,
                           &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t s = (*src++) | 0xff000000;
            uint32_t d = *dst;

            __m64 src = unpack_32_1x64 (s);
            __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
            __m64 mask = _mm_movepi64_pi64 (xmm_mask);
            __m64 dest = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (
                in_over_1x64 (&src, &alpha, &mask, &dest));

            w--;
        }
    }

    _mm_empty ();
}
/* --------------------------------------------------------------------
 * composite_over_8888_8888
 */
static void
sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
                               pixman_op_t op,
                               pixman_image_t *src_image,
                               pixman_image_t *mask_image,
                               pixman_image_t *dst_image,
                               int32_t src_x, int32_t src_y,
                               int32_t mask_x, int32_t mask_y,
                               int32_t dest_x, int32_t dest_y,
                               int32_t width, int32_t height)
{
    int dst_stride, src_stride;
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    dst = dst_line;
    src = src_line;

    while (height--)
    {
        core_combine_over_u_sse2 (dst, src, NULL, width);

        dst += dst_stride;
        src += src_stride;
    }

    _mm_empty ();
}
/* ------------------------------------------------------------------
 * composite_over_8888_0565
 */
static force_inline uint16_t
composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
{
    __m64 ms;

    ms = unpack_32_1x64 (src);
    return pack_565_32_16 (
        pack_1x64_32 (
            over_1x64 (
                ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
}
static void
sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
                               pixman_op_t op,
                               pixman_image_t *src_image,
                               pixman_image_t *mask_image,
                               pixman_image_t *dst_image,
                               int32_t src_x, int32_t src_y,
                               int32_t mask_x, int32_t mask_y,
                               int32_t dest_x, int32_t dest_y,
                               int32_t width, int32_t height)
{
    uint16_t *dst_line, *dst, d;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME
     *
     * This code was copied from the MMX version along with its FIXME;
     * if it is a problem there, it is probably a problem here too.
     */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        src = src_line;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);

        dst_line += dst_stride;
        src_line += src_stride;
        w = width;

        /* Align dst on a 16-byte boundary */
        while (w &&
               ((unsigned long)dst & 15))
        {
            s = *src++;
            d = *dst;

            *dst++ = composite_over_8888_0565pixel (s, d);
            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);

        /* It's an 8-pixel loop */
        while (w >= 8)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)src);
            cache_prefetch_next ((__m128i*)dst);

            /* The source load is unaligned because the source
             * address alignment is not guaranteed.
             */
            xmm_src = load_128_unaligned ((__m128i*) src);
            xmm_dst = load_128_aligned ((__m128i*) dst);

            /* Unpacking */
            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            /* The next 4 pixels are loaded early so the memory read
             * overlaps with the blend below.
             */
            xmm_src = load_128_unaligned ((__m128i*) (src + 4));

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst0, &xmm_dst1);

            /* Unpacking */
            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst2, &xmm_dst3);

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 8;
            dst += 8;
            src += 8;
        }

        while (w--)
        {
            s = *src++;
            d = *dst;

            *dst++ = composite_over_8888_0565pixel (s, d);
        }
    }

    _mm_empty ();
}
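
/*
 * The 8-pixel loop above is software pipelined: while the first four
 * unpacked source pixels are being blended, the next four are already
 * being fetched, so the latency of the unaligned load overlaps with ALU
 * work instead of stalling the loop.
 */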
/* -----------------------------------------------------------------
 * composite_over_n_8_8888
 */
static void
sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
                              pixman_op_t op,
                              pixman_image_t *src_image,
                              pixman_image_t *mask_image,
                              pixman_image_t *dst_image,
                              int32_t src_x, int32_t src_y,
                              int32_t mask_x, int32_t mask_y,
                              int32_t dest_x, int32_t dest_y,
                              int32_t width, int32_t height)
{
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t m, d;

    __m128i xmm_src, xmm_alpha, xmm_def;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = _mm_movepi64_pi64 (xmm_src);
    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
        {
            uint8_t m = *mask++;

            if (m)
            {
                d = *dst;
                mmx_mask = expand_pixel_8_1x64 (m);
                mmx_dest = unpack_32_1x64 (d);

                *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
                                                   &mmx_alpha,
                                                   &mmx_mask,
                                                   &mmx_dest));
            }

            w--;
            dst++;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)mask);
            cache_prefetch_next ((__m128i*)dst);

            m = *((uint32_t*)mask);

            if (srca == 0xff && m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_def);
            }
            else if (m)
            {
                xmm_dst = load_128_aligned ((__m128i*) dst);
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                /* Unpacking */
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            w -= 4;
            dst += 4;
            mask += 4;
        }

        while (w)
        {
            uint8_t m = *mask++;

            if (m)
            {
                d = *dst;
                mmx_mask = expand_pixel_8_1x64 (m);
                mmx_dest = unpack_32_1x64 (d);

                *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
                                                   &mmx_alpha,
                                                   &mmx_mask,
                                                   &mmx_dest));
            }

            w--;
            dst++;
        }
    }

    _mm_empty ();
}
/* ----------------------------------------------------------------
 * pixman_fill_sse2
 */
static pixman_bool_t
pixman_fill_sse2 (uint32_t *bits,
                  int       stride,
                  int       bpp,
                  int       x,
                  int       y,
                  int       width,
                  int       height,
                  uint32_t  data)
{
    uint32_t byte_width;
    uint8_t *byte_line;

    __m128i xmm_def;

    if (bpp == 16 && (data >> 16 != (data & 0xffff)))
        return FALSE;

    if (bpp != 16 && bpp != 32)
        return FALSE;

    if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;
    }
    else
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
    }

    cache_prefetch ((__m128i*)byte_line);
    xmm_def = create_mask_2x32_128 (data, data);

    while (height--)
    {
        int w;
        uint8_t *d = byte_line;
        byte_line += stride;
        w = byte_width;

        cache_prefetch_next ((__m128i*)d);

        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = data;
            w -= 2;
            d += 2;
        }

        while (w >= 4 && ((unsigned long)d & 15))
        {
            *(uint32_t *)d = data;
            w -= 4;
            d += 4;
        }

        cache_prefetch_next ((__m128i*)d);

        while (w >= 128)
        {
            cache_prefetch (((__m128i*)d) + 12);

            save_128_aligned ((__m128i*)(d), xmm_def);
            save_128_aligned ((__m128i*)(d + 16), xmm_def);
            save_128_aligned ((__m128i*)(d + 32), xmm_def);
            save_128_aligned ((__m128i*)(d + 48), xmm_def);
            save_128_aligned ((__m128i*)(d + 64), xmm_def);
            save_128_aligned ((__m128i*)(d + 80), xmm_def);
            save_128_aligned ((__m128i*)(d + 96), xmm_def);
            save_128_aligned ((__m128i*)(d + 112), xmm_def);

            d += 128;
            w -= 128;
        }

        if (w >= 64)
        {
            cache_prefetch (((__m128i*)d) + 8);

            save_128_aligned ((__m128i*)(d), xmm_def);
            save_128_aligned ((__m128i*)(d + 16), xmm_def);
            save_128_aligned ((__m128i*)(d + 32), xmm_def);
            save_128_aligned ((__m128i*)(d + 48), xmm_def);

            d += 64;
            w -= 64;
        }

        cache_prefetch_next ((__m128i*)d);

        if (w >= 32)
        {
            save_128_aligned ((__m128i*)(d), xmm_def);
            save_128_aligned ((__m128i*)(d + 16), xmm_def);

            d += 32;
            w -= 32;
        }

        if (w >= 16)
        {
            save_128_aligned ((__m128i*)(d), xmm_def);

            d += 16;
            w -= 16;
        }

        cache_prefetch_next ((__m128i*)d);

        while (w >= 4)
        {
            *(uint32_t *)d = data;
            w -= 4;
            d += 4;
        }

        if (w >= 2)
        {
            *(uint16_t *)d = data;
            w -= 2;
            d += 2;
        }
    }

    _mm_empty ();

    return TRUE;
}
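
/*
 * The fill loop above is structured as: an alignment prologue (16-bit,
 * then 32-bit stores until d is 16-byte aligned), a 128-bytes-per-
 * iteration main loop that prefetches well ahead of the stores,
 * progressively smaller 64/32/16-byte tails, and a scalar epilogue.
 * The early bpp check rejects 16 bpp fills whose two 16-bit halves
 * differ, since the same 32-bit pattern is stored regardless of bpp.
 */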
static void
sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
                             pixman_op_t op,
                             pixman_image_t *src_image,
                             pixman_image_t *mask_image,
                             pixman_image_t *dst_image,
                             int32_t src_x, int32_t src_y,
                             int32_t mask_x, int32_t mask_y,
                             int32_t dest_x, int32_t dest_y,
                             int32_t width, int32_t height)
{
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t m;

    __m128i xmm_src, xmm_def;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
        pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
                          PIXMAN_FORMAT_BPP (dst_image->bits.format),
                          dest_x, dest_y, width, height, 0);
        return;
    }

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
        {
            uint8_t m = *mask++;

            if (m)
            {
                *dst = pack_1x64_32 (
                    pix_multiply_1x64 (
                        _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
            }
            else
            {
                *dst = 0;
            }

            w--;
            dst++;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)mask);
            cache_prefetch_next ((__m128i*)dst);

            m = *((uint32_t*)mask);

            if (srca == 0xff && m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_def);
            }
            else if (m)
            {
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                /* Unpacking */
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                pix_multiply_2x128 (&xmm_src, &xmm_src,
                                    &xmm_mask_lo, &xmm_mask_hi,
                                    &xmm_mask_lo, &xmm_mask_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
            }
            else
            {
                save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
            }

            w -= 4;
            dst += 4;
            mask += 4;
        }

        while (w)
        {
            uint8_t m = *mask++;

            if (m)
            {
                *dst = pack_1x64_32 (
                    pix_multiply_1x64 (
                        _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
            }
            else
            {
                *dst = 0;
            }

            w--;
            dst++;
        }
    }

    _mm_empty ();
}
/* -----------------------------------------------------------------------
 * composite_over_n_8_0565
 */
static void
sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
                              pixman_op_t op,
                              pixman_image_t *src_image,
                              pixman_image_t *mask_image,
                              pixman_image_t *dst_image,
                              int32_t src_x, int32_t src_y,
                              int32_t mask_x, int32_t mask_y,
                              int32_t dest_x, int32_t dest_y,
                              int32_t width, int32_t height)
{
    uint32_t src;
    uint16_t *dst_line, *dst, d;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t m;

    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = _mm_movepi64_pi64 (xmm_src);
    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
        {
            m = *mask++;

            if (m)
            {
                d = *dst;
                mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
                mmx_dest = expand565_16_1x64 (d);

                *dst = pack_565_32_16 (
                    pack_1x64_32 (
                        in_over_1x64 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }

            w--;
            dst++;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w >= 8)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)mask);
            cache_prefetch_next ((__m128i*)dst);

            xmm_dst = load_128_aligned ((__m128i*) dst);
            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            m = *((uint32_t*)mask);
            mask += 4;

            if (m)
            {
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                /* Unpacking */
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst0, &xmm_dst1);
            }

            m = *((uint32_t*)mask);
            mask += 4;

            if (m)
            {
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                /* Unpacking */
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);
                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst2, &xmm_dst3);
            }

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 8;
            dst += 8;
        }

        while (w)
        {
            m = *mask++;

            if (m)
            {
                d = *dst;
                mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
                mmx_dest = expand565_16_1x64 (d);

                *dst = pack_565_32_16 (
                    pack_1x64_32 (
                        in_over_1x64 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }

            w--;
            dst++;
        }
    }

    _mm_empty ();
}
/* -----------------------------------------------------------------------
 * composite_over_pixbuf_0565
 */
static void
sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                 pixman_op_t op,
                                 pixman_image_t *src_image,
                                 pixman_image_t *mask_image,
                                 pixman_image_t *dst_image,
                                 int32_t src_x, int32_t src_y,
                                 int32_t mask_x, int32_t mask_y,
                                 int32_t dest_x, int32_t dest_y,
                                 int32_t width, int32_t height)
{
    uint16_t *dst_line, *dst, d;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t opaque, zero;

    __m64 ms;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME
     *
     * This code was copied from the MMX version along with its FIXME;
     * if it is a problem there, it is probably a problem here too.
     */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
        {
            s = *src++;
            d = *dst;

            ms = unpack_32_1x64 (s);

            *dst++ = pack_565_32_16 (
                pack_1x64_32 (
                    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);

        while (w >= 8)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)src);
            cache_prefetch_next ((__m128i*)dst);

            /* First round */
            xmm_src = load_128_unaligned ((__m128i*)src);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            opaque = is_opaque (xmm_src);
            zero = is_zero (xmm_src);

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

            /* preload next round */
            xmm_src = load_128_unaligned ((__m128i*)(src + 4));

            if (opaque)
            {
                invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
                                     &xmm_dst0, &xmm_dst1);
            }
            else if (!zero)
            {
                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
                                        &xmm_dst0, &xmm_dst1);
            }

            /* Second round */
            opaque = is_opaque (xmm_src);
            zero = is_zero (xmm_src);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

            if (opaque)
            {
                invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
                                     &xmm_dst2, &xmm_dst3);
            }
            else if (!zero)
            {
                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
                                        &xmm_dst2, &xmm_dst3);
            }

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 8;
            src += 8;
            dst += 8;
        }

        while (w)
        {
            s = *src++;
            d = *dst;

            ms = unpack_32_1x64 (s);

            *dst++ = pack_565_32_16 (
                pack_1x64_32 (
                    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
            w--;
        }
    }

    _mm_empty ();
}
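
/*
 * Background: "pixbuf" sources are non-premultiplied and have their color
 * channels in the opposite order from the destination, so
 * over_rev_non_pre first reorders the channels and multiplies them by the
 * source alpha before applying the normal OVER.  The is_opaque () /
 * is_zero () tests let a whole group take a cheap path: alpha all 0xff
 * only needs the color swap, and alpha all zero is skipped entirely.
 */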
/* -------------------------------------------------------------------------
 * composite_over_pixbuf_8888
 */
static void
sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                 pixman_op_t op,
                                 pixman_image_t *src_image,
                                 pixman_image_t *mask_image,
                                 pixman_image_t *dst_image,
                                 int32_t src_x, int32_t src_y,
                                 int32_t mask_x, int32_t mask_y,
                                 int32_t dest_x, int32_t dest_y,
                                 int32_t width, int32_t height)
{
    uint32_t *dst_line, *dst, d;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t opaque, zero;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME
     *
     * This code was copied from the MMX version along with its FIXME;
     * if it is a problem there, it is probably a problem here too.
     */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
        {
            s = *src++;
            d = *dst;

            *dst++ = pack_1x64_32 (
                over_rev_non_pre_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (d)));

            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)src);
            cache_prefetch_next ((__m128i*)dst);

            xmm_src_hi = load_128_unaligned ((__m128i*)src);

            opaque = is_opaque (xmm_src_hi);
            zero = is_zero (xmm_src_hi);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

            if (opaque)
            {
                invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
                                     &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }
            else if (!zero)
            {
                xmm_dst_hi = load_128_aligned ((__m128i*)dst);

                unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
                                        &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            w -= 4;
            dst += 4;
            src += 4;
        }

        while (w)
        {
            s = *src++;
            d = *dst;

            *dst++ = pack_1x64_32 (
                over_rev_non_pre_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (d)));

            w--;
        }
    }

    _mm_empty ();
}
/* -------------------------------------------------------------------------
 * composite_over_n_8888_0565_ca
 */
static void
sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                    pixman_op_t op,
                                    pixman_image_t *src_image,
                                    pixman_image_t *mask_image,
                                    pixman_image_t *dst_image,
                                    int32_t src_x, int32_t src_y,
                                    int32_t mask_x, int32_t mask_y,
                                    int32_t dest_x, int32_t dest_y,
                                    int32_t width, int32_t height)
{
    uint32_t src;
    uint16_t *dst_line, *dst, d;
    uint32_t *mask_line, *mask, m;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t pack_cmp;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = _mm_movepi64_pi64 (xmm_src);
    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);

    while (height--)
    {
        w = width;
        mask = mask_line;
        dst = dst_line;
        mask_line += mask_stride;
        dst_line += dst_stride;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                d = *dst;
                mmx_mask = unpack_32_1x64 (m);
                mmx_dest = expand565_16_1x64 (d);

                *dst = pack_565_32_16 (
                    pack_1x64_32 (
                        in_over_1x64 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }

            w--;
            dst++;
            mask++;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w >= 8)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)mask);
            cache_prefetch_next ((__m128i*)dst);

            /* First round */
            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            pack_cmp = _mm_movemask_epi8 (
                _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

            /* preload next round */
            xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));

            if (pack_cmp != 0xffff)
            {
                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst0, &xmm_dst1);
            }

            /* Second round */
            pack_cmp = _mm_movemask_epi8 (
                _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

            if (pack_cmp != 0xffff)
            {
                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst2, &xmm_dst3);
            }

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 8;
            dst += 8;
            mask += 8;
        }

        while (w)
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                d = *dst;
                mmx_mask = unpack_32_1x64 (m);
                mmx_dest = expand565_16_1x64 (d);

                *dst = pack_565_32_16 (
                    pack_1x64_32 (
                        in_over_1x64 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }

            w--;
            dst++;
            mask++;
        }
    }

    _mm_empty ();
}
/* -----------------------------------------------------------------------
 * composite_in_n_8_8
 */
static void
sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
                         pixman_op_t op,
                         pixman_image_t *src_image,
                         pixman_image_t *mask_image,
                         pixman_image_t *dst_image,
                         int32_t src_x, int32_t src_y,
                         int32_t mask_x, int32_t mask_y,
                         int32_t dest_x, int32_t dest_y,
                         int32_t width, int32_t height)
{
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t d, m;
    uint32_t src;
    uint8_t sa;
    int32_t w;

    __m128i xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    sa = src >> 24;

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (
                pix_multiply_1x64 (
                    pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
                                       unpack_32_1x64 (m)),
                    unpack_32_1x64 (d)));
            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w >= 16)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)mask);
            cache_prefetch_next ((__m128i*)dst);

            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
                                &xmm_mask_lo, &xmm_mask_hi,
                                &xmm_mask_lo, &xmm_mask_hi);

            pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                                &xmm_dst_lo, &xmm_dst_hi,
                                &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            mask += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (
                pix_multiply_1x64 (
                    pix_multiply_1x64 (
                        _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
                    unpack_32_1x64 (d)));
            w--;
        }
    }

    _mm_empty ();
}
/* ---------------------------------------------------------------------------
 * composite_in_8_8
 */

static void
sse2_composite_in_8_8 (pixman_implementation_t *imp,
                       pixman_op_t op,
                       pixman_image_t *src_image,
                       pixman_image_t *mask_image,
                       pixman_image_t *dst_image,
                       int32_t src_x, int32_t src_y,
                       int32_t mask_x, int32_t mask_y,
                       int32_t dest_x, int32_t dest_y,
                       int32_t width, int32_t height)
{
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;
    uint32_t s, d;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
        {
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (
                pix_multiply_1x64 (
                    unpack_32_1x64 (s), unpack_32_1x64 (d)));
            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);

        while (w >= 16)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)src);
            cache_prefetch_next ((__m128i*)dst);

            xmm_src = load_128_unaligned ((__m128i*)src);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                                &xmm_dst_lo, &xmm_dst_hi,
                                &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            src += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (
                pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
            w--;
        }
    }

    _mm_empty ();
}
/* -------------------------------------------------------------------------
 * composite_add_8888_8_8
 */
static void
sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
                             pixman_op_t op,
                             pixman_image_t *src_image,
                             pixman_image_t *mask_image,
                             pixman_image_t *dst_image,
                             int32_t src_x, int32_t src_y,
                             int32_t mask_x, int32_t mask_y,
                             int32_t dest_x, int32_t dest_y,
                             int32_t width, int32_t height)
{
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    uint32_t m, d;

    __m128i xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    sa = src >> 24;

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (
                _mm_adds_pu16 (
                    pix_multiply_1x64 (
                        _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
                    unpack_32_1x64 (d)));
            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)mask);
        cache_prefetch ((__m128i*)dst);

        while (w >= 16)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)mask);
            cache_prefetch_next ((__m128i*)dst);

            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
                                &xmm_mask_lo, &xmm_mask_hi,
                                &xmm_mask_lo, &xmm_mask_hi);

            xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
            xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            mask += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (
                _mm_adds_pu16 (
                    pix_multiply_1x64 (
                        _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
                    unpack_32_1x64 (d)));
            w--;
        }
    }

    _mm_empty ();
}
/* ----------------------------------------------------------------------
 * composite_add_8000_8000
 */
static void
sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
                              pixman_op_t op,
                              pixman_image_t *src_image,
                              pixman_image_t *mask_image,
                              pixman_image_t *dst_image,
                              int32_t src_x, int32_t src_y,
                              int32_t mask_x, int32_t mask_y,
                              int32_t dest_x, int32_t dest_y,
                              int32_t width, int32_t height)
{
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint16_t t;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        src = src_line;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);

        dst_line += dst_stride;
        src_line += src_stride;
        w = width;

        /* Small head */
        while (w && (unsigned long)dst & 3)
        {
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }

        core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

        /* Small tail */
        dst += w & 0xfffc;
        src += w & 0xfffc;

        w &= 3;

        while (w)
        {
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }
    }

    _mm_empty ();
}
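
/*
 * The scalar head and tail use a branch-free saturating add: t holds the
 * 9-bit sum of two bytes, so (t >> 8) is 1 exactly on overflow and
 * (0 - (t >> 8)) is then all ones, making the OR clamp the result to
 * 0xff.  For example, 200 + 100 = 0x12c, and 0x12c | 0xffff truncates to
 * 0xff when stored back into the byte.
 */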
/* ---------------------------------------------------------------------
 * composite_add_8888_8888
 */
static void
sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
                              pixman_op_t op,
                              pixman_image_t *src_image,
                              pixman_image_t *mask_image,
                              pixman_image_t *dst_image,
                              int32_t src_x, int32_t src_y,
                              int32_t mask_x, int32_t mask_y,
                              int32_t dest_x, int32_t dest_y,
                              int32_t width, int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;

        core_combine_add_u_sse2 (dst, src, NULL, width);
    }

    _mm_empty ();
}
/* -------------------------------------------------------------------------
 * sse2_composite_copy_area
 */
static pixman_bool_t
pixman_blt_sse2 (uint32_t *src_bits,
                 uint32_t *dst_bits,
                 int       src_stride,
                 int       dst_stride,
                 int       src_bpp,
                 int       dst_bpp,
                 int       src_x,
                 int       src_y,
                 int       dst_x,
                 int       dst_y,
                 int       width,
                 int       height)
{
    uint8_t *src_bytes;
    uint8_t *dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
        return FALSE;

    if (src_bpp == 16)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) +
                                src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) +
                                dst_stride * (dst_y) + (dst_x));
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) +
                                src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) +
                                dst_stride * (dst_y) + (dst_x));
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
    }
    else
    {
        return FALSE;
    }

    cache_prefetch ((__m128i*)src_bytes);
    cache_prefetch ((__m128i*)dst_bytes);

    while (height--)
    {
        int w;
        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;
        w = byte_width;

        cache_prefetch_next ((__m128i*)s);
        cache_prefetch_next ((__m128i*)d);

        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }

        while (w >= 4 && ((unsigned long)d & 15))
        {
            *(uint32_t *)d = *(uint32_t *)s;
            w -= 4;
            s += 4;
            d += 4;
        }

        cache_prefetch_next ((__m128i*)s);
        cache_prefetch_next ((__m128i*)d);

        while (w >= 64)
        {
            __m128i xmm0, xmm1, xmm2, xmm3;

            /* 128 bytes ahead */
            cache_prefetch (((__m128i*)s) + 8);
            cache_prefetch (((__m128i*)d) + 8);

            xmm0 = load_128_unaligned ((__m128i*)(s));
            xmm1 = load_128_unaligned ((__m128i*)(s + 16));
            xmm2 = load_128_unaligned ((__m128i*)(s + 32));
            xmm3 = load_128_unaligned ((__m128i*)(s + 48));

            save_128_aligned ((__m128i*)(d),      xmm0);
            save_128_aligned ((__m128i*)(d + 16), xmm1);
            save_128_aligned ((__m128i*)(d + 32), xmm2);
            save_128_aligned ((__m128i*)(d + 48), xmm3);

            s += 64;
            d += 64;
            w -= 64;
        }

        cache_prefetch_next ((__m128i*)s);
        cache_prefetch_next ((__m128i*)d);

        while (w >= 16)
        {
            save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));

            w -= 16;
            d += 16;
            s += 16;
        }

        cache_prefetch_next ((__m128i*)s);
        cache_prefetch_next ((__m128i*)d);

        while (w >= 4)
        {
            *(uint32_t *)d = *(uint32_t *)s;
            w -= 4;
            s += 4;
            d += 4;
        }

        if (w >= 2)
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }
    }

    _mm_empty ();

    return TRUE;
}
static void
sse2_composite_copy_area (pixman_implementation_t *imp,
                          pixman_op_t op,
                          pixman_image_t *src_image,
                          pixman_image_t *mask_image,
                          pixman_image_t *dst_image,
                          int32_t src_x, int32_t src_y,
                          int32_t mask_x, int32_t mask_y,
                          int32_t dest_x, int32_t dest_y,
                          int32_t width, int32_t height)
{
    pixman_blt_sse2 (src_image->bits.bits,
                     dst_image->bits.bits,
                     src_image->bits.rowstride,
                     dst_image->bits.rowstride,
                     PIXMAN_FORMAT_BPP (src_image->bits.format),
                     PIXMAN_FORMAT_BPP (dst_image->bits.format),
                     src_x, src_y, dest_x, dest_y, width, height);
}
/* This code is buggy in the MMX version, and the bug was carried over to
 * this SSE2 version. */
static void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t op,
                                 pixman_image_t *src_image,
                                 pixman_image_t *mask_image,
                                 pixman_image_t *dst_image,
                                 int32_t src_x, int32_t src_y,
                                 int32_t mask_x, int32_t mask_y,
                                 int32_t dest_x, int32_t dest_y,
                                 int32_t width, int32_t height)
{
    uint32_t *src, *src_line, s;
    uint32_t *dst, *dst_line, d;
    uint8_t *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);
        cache_prefetch ((__m128i*)mask);

        while (w && (unsigned long)dst & 15)
        {
            s = 0xff000000 | *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            __m64 ms = unpack_32_1x64 (s);

            if (m != 0xff)
            {
                ms = in_over_1x64 (ms,
                                   mask_x00ff,
                                   expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
                                   unpack_32_1x64 (d));
            }

            *dst++ = pack_1x64_32 (ms);
            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);
        cache_prefetch ((__m128i*)mask);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)src);
            cache_prefetch_next ((__m128i*)dst);
            cache_prefetch_next ((__m128i*)mask);

            m = *(uint32_t*) mask;
            xmm_src = _mm_or_si128 (
                load_128_unaligned ((__m128i*)src), mask_ff000000);

            if (m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_src);
            }
            else
            {
                xmm_dst = load_128_aligned ((__m128i*)dst);

                xmm_mask = _mm_unpacklo_epi16 (
                    unpack_32_1x128 (m), _mm_setzero_si128 ());

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (xmm_src_lo, xmm_src_hi,
                               mask_00ff, mask_00ff,
                               xmm_mask_lo, xmm_mask_hi,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            m = (uint32_t) *mask++;

            if (m)
            {
                s = 0xff000000 | *src;

                if (m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    d = *dst;

                    *dst = pack_1x64_32 (
                        in_over_1x64 (
                            unpack_32_1x64 (s),
                            mask_x00ff,
                            expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
                            unpack_32_1x64 (d)));
                }
            }

            src++;
            dst++;
            w--;
        }
    }

    _mm_empty ();
}
static const pixman_fast_path_t sse2_fast_paths[] =
{
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   sse2_composite_over_n_8_0565,       0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   sse2_composite_over_n_8_0565,       0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_n_8888,         0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_n_8888,         0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_n_0565,         0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_8888_0565,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_over_8888_0565,      0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888,       0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888,       0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888,       0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888,       0 },

    /* FIXME: this code is buggy in the MMX version, and the bug was carried
     * over to the SSE2 version. */
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_8_8888,    0 },

    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },

    { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       sse2_composite_add_8000_8000,       0 },
    { PIXMAN_OP_ADD,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888,       0 },
    { PIXMAN_OP_ADD,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888,       0 },
    { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       sse2_composite_add_8888_8_8,        0 },

    { PIXMAN_OP_SRC,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888,        0 },
    { PIXMAN_OP_SRC,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888,        0 },
    { PIXMAN_OP_SRC,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888,        0 },
    { PIXMAN_OP_SRC,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888,        0 },
    { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_copy_area,           0 },
    { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_copy_area,           0 },
    { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
    { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
    { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
    { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
    { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_copy_area,           0 },
    { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_copy_area,           0 },

    { PIXMAN_OP_IN,   PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       sse2_composite_in_8_8,              0 },
    { PIXMAN_OP_IN,   PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       sse2_composite_in_n_8_8,            0 },
};
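
/*
 * _pixman_run_fast_path () walks this table in order and calls the first
 * entry whose operator, source format, mask format and destination format
 * all match the request (PIXMAN_null meaning "no mask", PIXMAN_solid a
 * solid source or mask); the last column carries extra requirements such
 * as NEED_SOLID_MASK, NEED_COMPONENT_ALPHA or NEED_PIXBUF.
 */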
/*
 * Work around GCC bug causing crashes in Mozilla with SSE2
 *
 * When using -msse, gcc generates movdqa instructions assuming that
 * the stack is 16 byte aligned. Unfortunately some applications, such
 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
 * causes the movdqa instructions to fail.
 *
 * The __force_align_arg_pointer__ makes gcc generate a prologue that
 * realigns the stack pointer to 16 bytes.
 *
 * On x86-64 this is not necessary because the standard ABI already
 * calls for a 16 byte aligned stack.
 *
 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
 */
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static void
sse2_composite (pixman_implementation_t *imp,
                pixman_op_t op,
                pixman_image_t *src,
                pixman_image_t *mask,
                pixman_image_t *dest,
                int32_t src_x, int32_t src_y,
                int32_t mask_x, int32_t mask_y,
                int32_t dest_x, int32_t dest_y,
                int32_t width, int32_t height)
{
    if (_pixman_run_fast_path (sse2_fast_paths, imp,
                               op, src, mask, dest,
                               src_x, src_y,
                               mask_x, mask_y,
                               dest_x, dest_y,
                               width, height))
    {
        return;
    }

    _pixman_implementation_composite (imp->delegate, op,
                                      src, mask, dest,
                                      src_x, src_y,
                                      mask_x, mask_y,
                                      dest_x, dest_y,
                                      width, height);
}
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *src_bits,
          uint32_t *dst_bits,
          int src_stride,
          int dst_stride,
          int src_bpp,
          int dst_bpp,
          int src_x, int src_y,
          int dst_x, int dst_y,
          int width, int height)
{
    if (!pixman_blt_sse2 (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height))
    {
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height);
    }

    return TRUE;
}
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *bits,
           int stride,
           int bpp,
           int x,
           int y,
           int width,
           int height,
           uint32_t xor)
{
    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (void)
{
    pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
    pixman_implementation_t *imp = _pixman_implementation_create (mmx);
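
    /*
     * Implementations form a delegate chain: anything this SSE2
     * implementation does not handle falls through to the MMX
     * implementation created above, which in turn delegates to the
     * general C paths.
     */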
    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);

    /* MMX constants */
    mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
    mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);

    mask_x0080 = create_mask_16_64 (0x0080);
    mask_x00ff = create_mask_16_64 (0x00ff);
    mask_x0101 = create_mask_16_64 (0x0101);
    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);

    _mm_empty ();
    /* Set up function pointers */

    /* SSE code patch for fbcompose.c */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->composite = sse2_composite;
    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    return imp;
}
#endif /* USE_SSE2 */