2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
41 /* -------------------------------------------------------------------------------------------------
45 static __m64 mask_x0080;
46 static __m64 mask_x00ff;
47 static __m64 mask_x0101;
48 static __m64 mask_x_alpha;
50 static __m64 mask_x565_rgb;
51 static __m64 mask_x565_unpack;
53 static __m128i mask_0080;
54 static __m128i mask_00ff;
55 static __m128i mask_0101;
56 static __m128i mask_ffff;
57 static __m128i mask_ff000000;
58 static __m128i mask_alpha;
60 static __m128i mask_565_r;
61 static __m128i mask_565_g1, mask_565_g2;
62 static __m128i mask_565_b;
63 static __m128i mask_red;
64 static __m128i mask_green;
65 static __m128i mask_blue;
67 static __m128i mask_565_fix_rb;
68 static __m128i mask_565_fix_g;
70 /* -------------------------------------------------------------------------------------------------
73 static force_inline __m128i
74 unpack_32_1x128 (uint32_t data)
76 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128());
79 static force_inline void
80 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
82 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
83 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
86 static force_inline __m128i
87 unpack_565to8888 (__m128i lo)
89 __m128i r, g, b, rb, t;
91 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
92 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
93 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
95 rb = _mm_or_si128 (r, b);
96 t = _mm_and_si128 (rb, mask_565_fix_rb);
97 t = _mm_srli_epi32 (t, 5);
98 rb = _mm_or_si128 (rb, t);
100 t = _mm_and_si128 (g, mask_565_fix_g);
101 t = _mm_srli_epi32 (t, 6);
102 g = _mm_or_si128 (g, t);
104 return _mm_or_si128 (rb, g);
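/* For reference, a scalar sketch of the same 565 -> 8888 expansion
 * (hypothetical helper, not part of the original file): each field is
 * widened to 8 bits and its top bits are replicated into the freed low
 * bits, so 0x1f maps to 0xff and 0x00 stays 0x00. */
static force_inline uint32_t
expand_565_to_8888_scalar_sketch (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);   /* replicate the top 3 bits of red   */
    g = (g << 2) | (g >> 4);   /* replicate the top 2 bits of green */
    b = (b << 3) | (b >> 2);   /* replicate the top 3 bits of blue  */

    return (r << 16) | (g << 8) | b;
}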
107 static force_inline void
108 unpack_565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
112 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
113 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
115 lo = unpack_565to8888 (lo);
116 hi = unpack_565to8888 (hi);
118 unpack_128_2x128 (lo, data0, data1);
119 unpack_128_2x128 (hi, data2, data3);
122 static force_inline uint16_t
123 pack_565_32_16 (uint32_t pixel)
125 return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f));
128 static force_inline __m128i
129 pack_2x128_128 (__m128i lo, __m128i hi)
131 return _mm_packus_epi16 (lo, hi);
134 static force_inline __m128i
135 pack_565_2x128_128 (__m128i lo, __m128i hi)
138 __m128i r, g1, g2, b;
140 data = pack_2x128_128 ( lo, hi );
142 r = _mm_and_si128 (data , mask_565_r);
143 g1 = _mm_and_si128 (_mm_slli_epi32 (data , 3), mask_565_g1);
144 g2 = _mm_and_si128 (_mm_srli_epi32 (data , 5), mask_565_g2);
145 b = _mm_and_si128 (_mm_srli_epi32 (data , 3), mask_565_b);
147 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
150 static force_inline __m128i
151 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
153 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), pack_565_2x128_128 (*xmm2, *xmm3));
156 static force_inline int
157 is_opaque (__m128i x)
159 __m128i ffs = _mm_cmpeq_epi8 (x, x);
160 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
163 static force_inline int
164 is_zero (__m128i x)
166 return _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128())) == 0xffff;
169 static force_inline int
170 is_transparent (__m128i x)
172 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128())) & 0x8888) == 0x8888;
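/* Scalar sketch of the movemask trick used by the three tests above
 * (illustrative only): bit i of _mm_movemask_epi8 is the top bit of byte i
 * of its argument, and after _mm_cmpeq_epi8 that bit is set exactly when
 * byte i matched, so bits 3, 7, 11 and 15 (mask 0x8888) report on the alpha
 * bytes of the four ARGB pixels in the register. */
static force_inline int
is_opaque_scalar_sketch (const uint32_t p[4])
{
    return (p[0] >> 24) == 0xff && (p[1] >> 24) == 0xff &&
           (p[2] >> 24) == 0xff && (p[3] >> 24) == 0xff;
}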
175 static force_inline __m128i
176 expand_pixel_32_1x128 (uint32_t data)
178 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
181 static force_inline __m128i
182 expand_alpha_1x128 (__m128i data)
184 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
187 static force_inline void
188 expand_alpha_2x128 (__m128i data_lo, __m128i data_hi, __m128i* alpha_lo, __m128i* alpha_hi)
192 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE(3, 3, 3, 3));
193 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE(3, 3, 3, 3));
194 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
195 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
198 static force_inline void
199 expand_alpha_rev_2x128 (__m128i data_lo, __m128i data_hi, __m128i* alpha_lo, __m128i* alpha_hi)
203 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE(0, 0, 0, 0));
204 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE(0, 0, 0, 0));
205 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
206 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
209 static force_inline void
210 pix_multiply_2x128 (__m128i* data_lo, __m128i* data_hi, __m128i* alpha_lo, __m128i* alpha_hi, __m128i* ret_lo, __m128i* ret_hi)
214 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
215 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
216 lo = _mm_adds_epu16 (lo, mask_0080);
217 hi = _mm_adds_epu16 (hi, mask_0080);
218 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
219 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
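/* Scalar sketch of the per-channel math above (illustrative only, assuming
 * the usual pixman 8-bit multiply): adding 0x0080 and taking the high half
 * of a multiply by 0x0101 is an exact, rounded substitute for dividing the
 * 16-bit product by 255. */
static force_inline uint8_t
mul_un8_scalar_sketch (uint8_t a, uint8_t b)
{
    uint32_t t = (uint32_t) a * b + 0x80;     /* _mm_adds_epu16 with mask_0080 */
    return (uint8_t) ((t + (t >> 8)) >> 8);   /* equals (t * 0x0101) >> 16     */
}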
222 static force_inline void
223 pix_add_multiply_2x128 (__m128i* src_lo, __m128i* src_hi, __m128i* alpha_dst_lo, __m128i* alpha_dst_hi,
224 __m128i* dst_lo, __m128i* dst_hi, __m128i* alpha_src_lo, __m128i* alpha_src_hi,
225 __m128i* ret_lo, __m128i* ret_hi)
228 __m128i mul_lo, mul_hi;
230 lo = _mm_mullo_epi16 (*src_lo, *alpha_dst_lo);
231 hi = _mm_mullo_epi16 (*src_hi, *alpha_dst_hi);
232 mul_lo = _mm_mullo_epi16 (*dst_lo, *alpha_src_lo);
233 mul_hi = _mm_mullo_epi16 (*dst_hi, *alpha_src_hi);
234 lo = _mm_adds_epu16 (lo, mask_0080);
235 hi = _mm_adds_epu16 (hi, mask_0080);
236 lo = _mm_adds_epu16 (lo, mul_lo);
237 hi = _mm_adds_epu16 (hi, mul_hi);
238 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
239 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
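/* Scalar sketch of pix_add_multiply for one channel (illustrative only):
 * both products share a single 0x0080 rounding term before the combined sum
 * is normalised by the 0x0101 high-multiply, roughly
 * (x1 * a1 + x2 * a2 + 0x80) * 0x0101 >> 16; the vector code saturates the
 * intermediate sums at 16 bits. */
static force_inline uint8_t
add_mul_un8_scalar_sketch (uint8_t x1, uint8_t a1, uint8_t x2, uint8_t a2)
{
    uint32_t t = (uint32_t) x1 * a1 + (uint32_t) x2 * a2 + 0x80;
    uint32_t r = (t + (t >> 8)) >> 8;
    return (uint8_t) (r > 0xff ? 0xff : r);
}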
242 static force_inline void
243 negate_2x128 (__m128i data_lo, __m128i data_hi, __m128i* neg_lo, __m128i* neg_hi)
245 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
246 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
249 static force_inline void
250 invert_colors_2x128 (__m128i data_lo, __m128i data_hi, __m128i* inv_lo, __m128i* inv_hi)
254 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE(3, 0, 1, 2));
255 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE(3, 0, 1, 2));
256 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
257 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
260 static force_inline void
261 over_2x128 (__m128i* src_lo, __m128i* src_hi, __m128i* alpha_lo, __m128i* alpha_hi, __m128i* dst_lo, __m128i* dst_hi)
265 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
267 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
269 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
270 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
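/* The function above is the premultiplied OVER operator.  Per channel, and
 * ignoring the vector packing, it computes the following (scalar sketch,
 * illustrative only): */
static force_inline uint8_t
over_un8_scalar_sketch (uint8_t s, uint8_t sa, uint8_t d)
{
    uint32_t t = (uint32_t) d * (0xff - sa) + 0x80;   /* dst * (1 - src alpha) */
    uint32_t r = s + ((t + (t >> 8)) >> 8);           /* + src                 */
    return (uint8_t) (r > 0xff ? 0xff : r);           /* _mm_adds_epu8 clamp   */
}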
273 static force_inline void
274 over_rev_non_pre_2x128 (__m128i src_lo, __m128i src_hi, __m128i* dst_lo, __m128i* dst_hi)
277 __m128i alpha_lo, alpha_hi;
279 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
281 lo = _mm_or_si128 (alpha_lo, mask_alpha);
282 hi = _mm_or_si128 (alpha_hi, mask_alpha);
284 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
286 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
288 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
291 static force_inline void
292 in_over_2x128 (__m128i* src_lo, __m128i* src_hi, __m128i* alpha_lo, __m128i* alpha_hi,
293 __m128i* mask_lo, __m128i* mask_hi, __m128i* dst_lo, __m128i* dst_hi)
298 pix_multiply_2x128 ( src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
299 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
301 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
304 static force_inline void
305 cache_prefetch (__m128i* addr)
307 _mm_prefetch (addr, _MM_HINT_T0);
310 static force_inline void
311 cache_prefetch_next (__m128i* addr)
313 _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
316 /* load 4 pixels from a 16-byte aligned address */
317 static force_inline __m128i
318 load_128_aligned (__m128i* src)
320 return _mm_load_si128 (src);
323 /* load 4 pixels from an unaligned address */
324 static force_inline __m128i
325 load_128_unaligned (const __m128i* src)
327 return _mm_loadu_si128 (src);
330 /* save 4 pixels using write-combining (non-temporal) stores to a 16-byte aligned address */
331 static force_inline void
332 save_128_write_combining (__m128i* dst, __m128i data)
334 _mm_stream_si128 (dst, data);
337 /* save 4 pixels to a 16-byte aligned address */
338 static force_inline void
339 save_128_aligned (__m128i* dst, __m128i data)
341 _mm_store_si128 (dst, data);
344 /* save 4 pixels to an unaligned address */
345 static force_inline void
346 save_128_unaligned (__m128i* dst, __m128i data)
348 _mm_storeu_si128 (dst, data);
351 /* -------------------------------------------------------------------------------------------------
355 static force_inline __m64
356 unpack_32_1x64 (uint32_t data)
358 return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64());
361 static force_inline __m64
362 expand_alpha_1x64 (__m64 data)
364 return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
367 static force_inline __m64
368 expand_alpha_rev_1x64 (__m64 data)
370 return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
373 static force_inline __m64
374 expand_pixel_8_1x64 (uint8_t data)
376 return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
379 static force_inline __m64
380 pix_multiply_1x64 (__m64 data, __m64 alpha)
382 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
383 mask_x0080),
384 mask_x0101);
387 static force_inline __m64
388 pix_add_multiply_1x64 (__m64* src, __m64* alpha_dst, __m64* dst, __m64* alpha_src)
390 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst),
391 mask_x0080),
392 _mm_mullo_pi16 (*dst, *alpha_src)),
393 mask_x0101);
396 static force_inline __m64
397 negate_1x64 (__m64 data)
399 return _mm_xor_si64 (data, mask_x00ff);
402 static force_inline __m64
403 invert_colors_1x64 (__m64 data)
405 return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
408 static force_inline __m64
409 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
411 return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
414 static force_inline __m64
415 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
417 return over_1x64 (pix_multiply_1x64 (*src, *mask),
418 pix_multiply_1x64 (*alpha, *mask),
419 *dst);
422 static force_inline __m64
423 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
425 __m64 alpha = expand_alpha_1x64 (src);
427 return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
428 _mm_or_si64 (alpha, mask_x_alpha)),
429 alpha,
430 dst);
433 static force_inline uint32_t
434 pack_1x64_32 (__m64 data)
436 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64()));
439 /* Expand 16 bits positioned at @pos (0-3) of an mmx register into
441 * 00RR 00GG 00BB
443 * --- Expanding 565 in the low word ---
445 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
446 * m = m & (01f0003f001f);
447 * m = m * (008404100840);
448 * m = m >> 8;
450 * Note the trick here - the top word is shifted by another nibble to
451 * avoid it bumping into the middle word
453 static force_inline __m64
454 expand565_16_1x64 (uint16_t pixel)
459 p = _mm_cvtsi32_si64 ((uint32_t) pixel);
461 t1 = _mm_slli_si64 (p, 36 - 11);
462 t2 = _mm_slli_si64 (p, 16 - 5);
464 p = _mm_or_si64 (t1, p);
465 p = _mm_or_si64 (t2, p);
466 p = _mm_and_si64 (p, mask_x565_rgb);
467 p = _mm_mullo_pi16 (p, mask_x565_unpack);
469 return _mm_srli_pi16 (p, 8);
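/* Worked example of the multiply trick above (illustrative only): for pure
 * red, pixel = 0xf800.  After the shifts and the mask the red field sits as
 * 0x01f0 in the third 16-bit lane; the per-lane multiply gives
 * 0x01f0 * 0x0084 = 0xffc0, and the final >> 8 leaves 0xff, i.e. the 5-bit
 * value 0x1f correctly expanded to 8 bits.  Green and blue expand the same
 * way through their own lanes and multipliers. */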
472 /* -------------------------------------------------------------------------------------------------
473 * Compose Core transformations
475 static force_inline uint32_t
476 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
489 ms = unpack_32_1x64 (src);
490 return pack_1x64_32 (over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
496 static force_inline uint32_t
497 combine1 (const uint32_t *ps, const uint32_t *pm)
505 mm = unpack_32_1x64 (*pm);
506 mm = expand_alpha_1x64 (mm);
508 ms = unpack_32_1x64 (s);
509 ms = pix_multiply_1x64 (ms, mm);
511 s = pack_1x64_32 (ms);
517 static force_inline __m128i
518 combine4 (const __m128i *ps, const __m128i *pm)
520 __m128i xmm_src_lo, xmm_src_hi;
521 __m128i xmm_msk_lo, xmm_msk_hi;
526 xmm_msk_lo = load_128_unaligned (pm);
528 if (is_transparent (xmm_msk_lo))
529 return _mm_setzero_si128 ();
532 s = load_128_unaligned (ps);
536 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
537 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
539 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
541 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_msk_lo, &xmm_msk_hi, &xmm_src_lo, &xmm_src_hi);
543 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
549 static force_inline void
550 core_combine_over_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
554 __m128i xmm_dst_lo, xmm_dst_hi;
555 __m128i xmm_src_lo, xmm_src_hi;
556 __m128i xmm_alpha_lo, xmm_alpha_hi;
558 /* call prefetch hint to optimize cache load*/
559 cache_prefetch ((__m128i*)ps);
560 cache_prefetch ((__m128i*)pd);
561 cache_prefetch ((__m128i*)pm);
563 /* Align dst on a 16-byte boundary */
564 while (w &&
565 ((unsigned long)pd & 15))
568 s = combine1 (ps, pm);
570 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
577 /* call prefetch hint to optimize cache load*/
578 cache_prefetch ((__m128i*)ps);
579 cache_prefetch ((__m128i*)pd);
580 cache_prefetch ((__m128i*)pm);
584 /* fill cache line with next memory */
585 cache_prefetch_next ((__m128i*)ps);
586 cache_prefetch_next ((__m128i*)pd);
587 cache_prefetch_next ((__m128i*)pm);
589 /* I'm loading unaligned because I'm not sure about the address alignment. */
590 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
592 if (is_opaque (xmm_src_hi))
594 save_128_aligned ((__m128i*)pd, xmm_src_hi);
596 else if (!is_zero (xmm_src_hi))
598 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
600 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
601 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
603 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
605 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi);
607 /* rebuild the 4 pixel data and save */
608 save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
621 s = combine1 (ps, pm);
623 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
631 static force_inline void
632 core_combine_over_reverse_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
636 __m128i xmm_dst_lo, xmm_dst_hi;
637 __m128i xmm_src_lo, xmm_src_hi;
638 __m128i xmm_alpha_lo, xmm_alpha_hi;
640 /* call prefetch hint to optimize cache load*/
641 cache_prefetch ((__m128i*)ps);
642 cache_prefetch ((__m128i*)pd);
643 cache_prefetch ((__m128i*)pm);
645 /* Align dst on a 16-byte boundary */
646 while (w &&
647 ((unsigned long)pd & 15))
650 s = combine1 (ps, pm);
652 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
659 /* call prefetch hint to optimize cache load*/
660 cache_prefetch ((__m128i*)ps);
661 cache_prefetch ((__m128i*)pd);
662 cache_prefetch ((__m128i*)pm);
666 /* fill cache line with next memory */
667 cache_prefetch_next ((__m128i*)ps);
668 cache_prefetch_next ((__m128i*)pd);
669 cache_prefetch_next ((__m128i*)pm);
671 /* I'm loading unaligned because I'm not sure about the address alignment. */
672 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
673 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
675 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
676 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
678 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi);
680 over_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_src_lo, &xmm_src_hi);
682 /* rebuild the 4 pixel data and save */
683 save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_src_lo, xmm_src_hi));
695 s = combine1 (ps, pm);
697 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
705 static force_inline uint32_t
706 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
708 uint32_t maska = src >> 24;
710 if (maska == 0)
712 return 0;
714 else if (maska != 0xff)
716 return pack_1x64_32(pix_multiply_1x64 (unpack_32_1x64 (dst), expand_alpha_1x64 (unpack_32_1x64 (src))));
722 static force_inline void
723 core_combine_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
727 __m128i xmm_src_lo, xmm_src_hi;
728 __m128i xmm_dst_lo, xmm_dst_hi;
730 /* call prefetch hint to optimize cache load*/
731 cache_prefetch ((__m128i*)ps);
732 cache_prefetch ((__m128i*)pd);
733 cache_prefetch ((__m128i*)pm);
735 while (w && ((unsigned long) pd & 15))
737 s = combine1 (ps, pm);
740 *pd++ = core_combine_in_u_pixelsse2 (d, s);
747 /* call prefetch hint to optimize cache load*/
748 cache_prefetch ((__m128i*)ps);
749 cache_prefetch ((__m128i*)pd);
750 cache_prefetch ((__m128i*)pm);
754 /* fill cache line with next memory */
755 cache_prefetch_next ((__m128i*)ps);
756 cache_prefetch_next ((__m128i*)pd);
757 cache_prefetch_next ((__m128i*)pm);
759 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
760 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
762 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
763 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
765 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
766 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
768 save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
779 s = combine1 (ps, pm);
782 *pd++ = core_combine_in_u_pixelsse2 (d, s);
790 static force_inline void
791 core_combine_reverse_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
795 __m128i xmm_src_lo, xmm_src_hi;
796 __m128i xmm_dst_lo, xmm_dst_hi;
798 /* call prefetch hint to optimize cache load*/
799 cache_prefetch ((__m128i*)ps);
800 cache_prefetch ((__m128i*)pd);
801 cache_prefetch ((__m128i*)pm);
803 while (w && ((unsigned long) pd & 15))
805 s = combine1 (ps, pm);
808 *pd++ = core_combine_in_u_pixelsse2 (s, d);
815 /* call prefetch hint to optimize cache load*/
816 cache_prefetch ((__m128i*)ps);
817 cache_prefetch ((__m128i*)pd);
818 cache_prefetch ((__m128i*)pm);
822 /* fill cache line with next memory */
823 cache_prefetch_next ((__m128i*)ps);
824 cache_prefetch_next ((__m128i*)pd);
825 cache_prefetch_next ((__m128i*)pm);
827 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
828 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
830 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
831 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
833 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
834 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi);
836 save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
847 s = combine1 (ps, pm);
850 *pd++ = core_combine_in_u_pixelsse2 (s, d);
858 static force_inline void
859 core_combine_reverse_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
861 /* call prefetch hint to optimize cache load*/
862 cache_prefetch ((__m128i*)ps);
863 cache_prefetch ((__m128i*)pd);
864 cache_prefetch ((__m128i*)pm);
866 while (w && ((unsigned long) pd & 15))
868 uint32_t s = combine1 (ps, pm);
871 *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (s)))));
878 /* call prefetch hint to optimize cache load*/
879 cache_prefetch ((__m128i*)ps);
880 cache_prefetch ((__m128i*)pd);
881 cache_prefetch ((__m128i*)pm);
885 __m128i xmm_src_lo, xmm_src_hi;
886 __m128i xmm_dst_lo, xmm_dst_hi;
888 /* fill cache line with next memory */
889 cache_prefetch_next ((__m128i*)ps);
890 cache_prefetch_next ((__m128i*)pd);
891 cache_prefetch_next ((__m128i*)pm);
893 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
894 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
896 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
897 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
899 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
900 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
902 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi);
904 save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
915 uint32_t s = combine1 (ps, pm);
918 *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (s)))));
926 static force_inline void
927 core_combine_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
929 /* call prefetch hint to optimize cache load*/
930 cache_prefetch ((__m128i*)ps);
931 cache_prefetch ((__m128i*)pd);
932 cache_prefetch ((__m128i*)pm);
934 while (w && ((unsigned long) pd & 15))
936 uint32_t s = combine1 (ps, pm);
939 *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
946 /* call prefetch hint to optimize cache load*/
947 cache_prefetch ((__m128i*)ps);
948 cache_prefetch ((__m128i*)pd);
949 cache_prefetch ((__m128i*)pm);
953 __m128i xmm_src_lo, xmm_src_hi;
954 __m128i xmm_dst_lo, xmm_dst_hi;
956 /* fill cache line with next memory */
957 cache_prefetch_next ((__m128i*)ps);
958 cache_prefetch_next ((__m128i*)pd);
959 cache_prefetch_next ((__m128i*)pm);
961 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
962 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
964 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
965 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
967 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
968 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
970 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
972 save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
983 uint32_t s = combine1 (ps, pm);
986 *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
994 static force_inline uint32_t
995 core_combine_atop_u_pixel_sse2 (uint32_t src, uint32_t dst)
997 __m64 s = unpack_32_1x64 (src);
998 __m64 d = unpack_32_1x64 (dst);
1000 __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1001 __m64 da = expand_alpha_1x64 (d);
1003 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
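/* Per channel, the function above computes the Porter-Duff ATOP operator:
 * result = src * dst_alpha + dst * (1 - src_alpha), with both products
 * normalised by 255 in a single pix_add_multiply pass. */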
1006 static force_inline void
1007 core_combine_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
1011 __m128i xmm_src_lo, xmm_src_hi;
1012 __m128i xmm_dst_lo, xmm_dst_hi;
1013 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1014 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1016 /* call prefetch hint to optimize cache load*/
1017 cache_prefetch ((__m128i*)ps);
1018 cache_prefetch ((__m128i*)pd);
1019 cache_prefetch ((__m128i*)pm);
1021 while (w && ((unsigned long) pd & 15))
1023 s = combine1 (ps, pm);
1026 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1033 /* call prefetch hint to optimize cache load*/
1034 cache_prefetch ((__m128i*)ps);
1035 cache_prefetch ((__m128i*)pd);
1036 cache_prefetch ((__m128i*)pm);
1040 /* fill cache line with next memory */
1041 cache_prefetch_next ((__m128i*)ps);
1042 cache_prefetch_next ((__m128i*)pd);
1043 cache_prefetch_next ((__m128i*)pm);
1045 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1046 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1048 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1049 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1051 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1052 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1054 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1056 pix_add_multiply_2x128 ( &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1057 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1058 &xmm_dst_lo, &xmm_dst_hi );
1060 save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1071 s = combine1 (ps, pm);
1074 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1082 static force_inline uint32_t
1083 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, uint32_t dst)
1085 __m64 s = unpack_32_1x64 (src);
1086 __m64 d = unpack_32_1x64 (dst);
1088 __m64 sa = expand_alpha_1x64 (s);
1089 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1091 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1094 static force_inline void
1095 core_combine_reverse_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
1099 __m128i xmm_src_lo, xmm_src_hi;
1100 __m128i xmm_dst_lo, xmm_dst_hi;
1101 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1102 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1104 /* call prefetch hint to optimize cache load*/
1105 cache_prefetch ((__m128i*)ps);
1106 cache_prefetch ((__m128i*)pd);
1107 cache_prefetch ((__m128i*)pm);
1109 while (w && ((unsigned long) pd & 15))
1111 s = combine1 (ps, pm);
1114 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1121 /* call prefetch hint to optimize cache load*/
1122 cache_prefetch ((__m128i*)ps);
1123 cache_prefetch ((__m128i*)pd);
1124 cache_prefetch ((__m128i*)pm);
1128 /* fill cache line with next memory */
1129 cache_prefetch_next ((__m128i*)ps);
1130 cache_prefetch_next ((__m128i*)pd);
1131 cache_prefetch_next ((__m128i*)pm);
1133 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1134 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1136 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1137 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1139 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1140 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1142 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1144 pix_add_multiply_2x128 ( &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1145 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1146 &xmm_dst_lo, &xmm_dst_hi );
1148 save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1159 s = combine1 (ps, pm);
1162 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1170 static force_inline uint32_t
1171 core_combine_xor_u_pixel_sse2 (uint32_t src, uint32_t dst)
1173 __m64 s = unpack_32_1x64 (src);
1174 __m64 d = unpack_32_1x64 (dst);
1176 __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1177 __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1179 return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
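/* Per channel, the function above computes the Porter-Duff XOR operator:
 * result = src * (1 - dst_alpha) + dst * (1 - src_alpha), normalised by 255
 * in a single pix_add_multiply pass. */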
1182 static force_inline void
1183 core_combine_xor_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mask, int width)
1188 const uint32_t* ps = src;
1189 const uint32_t* pm = mask;
1191 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1192 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1193 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1194 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1196 /* call prefetch hint to optimize cache load*/
1197 cache_prefetch ((__m128i*)ps);
1198 cache_prefetch ((__m128i*)pd);
1199 cache_prefetch ((__m128i*)pm);
1201 while (w && ((unsigned long) pd & 15))
1203 s = combine1 (ps, pm);
1206 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1213 /* call prefetch hint to optimize cache load*/
1214 cache_prefetch ((__m128i*)ps);
1215 cache_prefetch ((__m128i*)pd);
1216 cache_prefetch ((__m128i*)pm);
1220 /* fill cache line with next memory */
1221 cache_prefetch_next ((__m128i*)ps);
1222 cache_prefetch_next ((__m128i*)pd);
1223 cache_prefetch_next ((__m128i*)pm);
1225 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1226 xmm_dst = load_128_aligned ((__m128i*) pd);
1228 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1229 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1231 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1232 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1234 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1235 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1237 pix_add_multiply_2x128 ( &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1238 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1239 &xmm_dst_lo, &xmm_dst_hi );
1241 save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1252 s = combine1 (ps, pm);
1255 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1263 static force_inline void
1264 core_combine_add_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t* mask, int width)
1269 const uint32_t* ps = src;
1270 const uint32_t* pm = mask;
1272 /* call prefetch hint to optimize cache load*/
1273 cache_prefetch ((__m128i*)ps);
1274 cache_prefetch ((__m128i*)pd);
1275 cache_prefetch ((__m128i*)pm);
1277 while (w && (unsigned long)pd & 15)
1279 s = combine1 (ps, pm);
1284 *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1288 /* call prefetch hint to optimize cache load*/
1289 cache_prefetch ((__m128i*)ps);
1290 cache_prefetch ((__m128i*)pd);
1291 cache_prefetch ((__m128i*)pm);
1297 /* fill cache line with next memory */
1298 cache_prefetch_next ((__m128i*)ps);
1299 cache_prefetch_next ((__m128i*)pd);
1300 cache_prefetch_next ((__m128i*)pm);
1302 s = combine4((__m128i*)ps,(__m128i*)pm);
1304 save_128_aligned( (__m128i*)pd,
1305 _mm_adds_epu8( s, load_128_aligned ((__m128i*)pd)) );
1315 s = combine1 (ps, pm);
1318 *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1324 static force_inline uint32_t
1325 core_combine_saturate_u_pixel_sse2 (uint32_t src, uint32_t dst)
1327 __m64 ms = unpack_32_1x64 (src);
1328 __m64 md = unpack_32_1x64 (dst);
1329 uint32_t sa = src >> 24;
1330 uint32_t da = ~dst >> 24;
1334 ms = pix_multiply_1x64 (ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8(da, sa) << 24)));
1337 return pack_1x64_32 (_mm_adds_pu16 (md, ms));
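/* The function above implements the SATURATE step: when the source alpha
 * exceeds the remaining headroom in the destination (~dst >> 24), the source
 * is first scaled by da/sa so that the saturating add at the end cannot push
 * the destination alpha past 0xff; otherwise the plain saturating add is
 * already safe. */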
1340 static force_inline void
1341 core_combine_saturate_u_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1346 __m128i xmm_src, xmm_dst;
1348 /* call prefetch hint to optimize cache load*/
1349 cache_prefetch ((__m128i*)ps);
1350 cache_prefetch ((__m128i*)pd);
1351 cache_prefetch ((__m128i*)pm);
1353 while (w && (unsigned long)pd & 15)
1355 s = combine1 (ps, pm);
1357 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1364 /* call prefetch hint to optimize cache load*/
1365 cache_prefetch ((__m128i*)ps);
1366 cache_prefetch ((__m128i*)pd);
1367 cache_prefetch ((__m128i*)pm);
1371 /* fill cache line with next memory */
1372 cache_prefetch_next ((__m128i*)ps);
1373 cache_prefetch_next ((__m128i*)pd);
1374 cache_prefetch_next ((__m128i*)pm);
1376 xmm_dst = load_128_aligned ((__m128i*)pd);
1377 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1379 pack_cmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmm_src, 24),
1380 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1382 /* if any source alpha is greater than the corresponding inverted destination alpha */
1385 s = combine1 (ps++, pm);
1387 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1391 s = combine1 (ps++, pm);
1393 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1397 s = combine1 (ps++, pm);
1399 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1403 s = combine1 (ps++, pm);
1405 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1411 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1424 s = combine1 (ps, pm);
1426 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1433 static force_inline void
1434 core_combine_src_c_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
1438 __m128i xmm_src_lo, xmm_src_hi;
1439 __m128i xmm_mask_lo, xmm_mask_hi;
1440 __m128i xmm_dst_lo, xmm_dst_hi;
1442 /* call prefetch hint to optimize cache load*/
1443 cache_prefetch ((__m128i*)ps);
1444 cache_prefetch ((__m128i*)pd);
1445 cache_prefetch ((__m128i*)pm);
1447 while (w && (unsigned long)pd & 15)
1451 *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1455 /* call prefetch hint to optimize cache load*/
1456 cache_prefetch ((__m128i*)ps);
1457 cache_prefetch ((__m128i*)pd);
1458 cache_prefetch ((__m128i*)pm);
1462 /* fill cache line with next memory */
1463 cache_prefetch_next ((__m128i*)ps);
1464 cache_prefetch_next ((__m128i*)pd);
1465 cache_prefetch_next ((__m128i*)pm);
1467 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1468 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1470 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1471 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1473 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
1475 save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1487 *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1492 static force_inline uint32_t
1493 core_combine_over_c_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
1495 __m64 s = unpack_32_1x64 (src);
1496 __m64 expAlpha = expand_alpha_1x64 (s);
1497 __m64 unpk_mask = unpack_32_1x64 (mask);
1498 __m64 unpk_dst = unpack_32_1x64 (dst);
1500 return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1503 static force_inline void
1504 core_combine_over_c_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
1508 __m128i xmm_alpha_lo, xmm_alpha_hi;
1509 __m128i xmm_src_lo, xmm_src_hi;
1510 __m128i xmm_dst_lo, xmm_dst_hi;
1511 __m128i xmm_mask_lo, xmm_mask_hi;
1513 /* call prefetch hint to optimize cache load*/
1514 cache_prefetch ((__m128i*)ps);
1515 cache_prefetch ((__m128i*)pd);
1516 cache_prefetch ((__m128i*)pm);
1518 while (w && (unsigned long)pd & 15)
1524 *pd++ = core_combine_over_c_pixel_sse2 (s, m, d);
1528 /* call prefetch hint to optimize cache load*/
1529 cache_prefetch ((__m128i*)ps);
1530 cache_prefetch ((__m128i*)pd);
1531 cache_prefetch ((__m128i*)pm);
1535 /* fill cache line with next memory */
1536 cache_prefetch_next ((__m128i*)ps);
1537 cache_prefetch_next ((__m128i*)pd);
1538 cache_prefetch_next ((__m128i*)pm);
1540 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1541 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1542 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1544 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1545 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1546 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1548 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
1550 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
1552 save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1566 *pd++ = core_combine_over_c_pixel_sse2 (s, m, d);
1571 static force_inline uint32_t
1572 core_combine_over_reverse_c_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
1574 __m64 d = unpack_32_1x64 (dst);
1576 return pack_1x64_32(over_1x64 (d, expand_alpha_1x64 (d), pix_multiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
1579 static force_inline void
1580 core_combine_over_reverse_c_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
1584 __m128i xmm_alpha_lo, xmm_alpha_hi;
1585 __m128i xmm_src_lo, xmm_src_hi;
1586 __m128i xmm_dst_lo, xmm_dst_hi;
1587 __m128i xmm_mask_lo, xmm_mask_hi;
1589 /* call prefetch hint to optimize cache load*/
1590 cache_prefetch ((__m128i*)ps);
1591 cache_prefetch ((__m128i*)pd);
1592 cache_prefetch ((__m128i*)pm);
1594 while (w && (unsigned long)pd & 15)
1600 *pd++ = core_combine_over_reverse_c_pixel_sse2 (s, m, d);
1604 /* call prefetch hint to optimize cache load*/
1605 cache_prefetch ((__m128i*)ps);
1606 cache_prefetch ((__m128i*)pd);
1607 cache_prefetch ((__m128i*)pm);
1611 /* fill cache line with next memory */
1612 cache_prefetch_next ((__m128i*)ps);
1613 cache_prefetch_next ((__m128i*)pd);
1614 cache_prefetch_next ((__m128i*)pm);
1616 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1617 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1618 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1620 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1621 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1622 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1624 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi);
1625 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1627 over_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask_lo, &xmm_mask_hi);
1629 save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1643 *pd++ = core_combine_over_reverse_c_pixel_sse2 (s, m, d);
1648 static force_inline void
1649 core_combine_in_c_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1653 __m128i xmm_alpha_lo, xmm_alpha_hi;
1654 __m128i xmm_src_lo, xmm_src_hi;
1655 __m128i xmm_dst_lo, xmm_dst_hi;
1656 __m128i xmm_mask_lo, xmm_mask_hi;
1658 /* call prefetch hint to optimize cache load*/
1659 cache_prefetch ((__m128i*)ps);
1660 cache_prefetch ((__m128i*)pd);
1661 cache_prefetch ((__m128i*)pm);
1663 while (w && (unsigned long)pd & 15)
1669 *pd++ = pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1670 expand_alpha_1x64 (unpack_32_1x64 (d))));
1674 /* call prefetch hint to optimize cache load*/
1675 cache_prefetch ((__m128i*)ps);
1676 cache_prefetch ((__m128i*)pd);
1677 cache_prefetch ((__m128i*)pm);
1681 /* fill cache line with next memory */
1682 cache_prefetch_next ((__m128i*)ps);
1683 cache_prefetch_next ((__m128i*)pd);
1684 cache_prefetch_next ((__m128i*)pm);
1686 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1687 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1688 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1690 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1691 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1692 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1694 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi);
1695 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
1697 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi);
1699 save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1713 *pd++ = pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1714 expand_alpha_1x64 (unpack_32_1x64 (d))));
1719 static force_inline void
1720 core_combine_in_reverse_c_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1724 __m128i xmm_alpha_lo, xmm_alpha_hi;
1725 __m128i xmm_src_lo, xmm_src_hi;
1726 __m128i xmm_dst_lo, xmm_dst_hi;
1727 __m128i xmm_mask_lo, xmm_mask_hi;
1729 /* call prefetch hint to optimize cache load*/
1730 cache_prefetch ((__m128i*)ps);
1731 cache_prefetch ((__m128i*)pd);
1732 cache_prefetch ((__m128i*)pm);
1734 while (w && (unsigned long)pd & 15)
1740 *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d),
1741 pix_multiply_1x64 (unpack_32_1x64 (m),
1742 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1746 /* call prefetch hint to optimize cache load*/
1747 cache_prefetch ((__m128i*)ps);
1748 cache_prefetch ((__m128i*)pd);
1749 cache_prefetch ((__m128i*)pm);
1753 /* fill cache line with next memory */
1754 cache_prefetch_next ((__m128i*)ps);
1755 cache_prefetch_next ((__m128i*)pd);
1756 cache_prefetch_next ((__m128i*)pm);
1758 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1759 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1760 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1762 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1763 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1764 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1766 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
1767 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_alpha_lo, &xmm_alpha_hi);
1769 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi);
1771 save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1785 *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d),
1786 pix_multiply_1x64 (unpack_32_1x64 (m),
1787 expand_alpha_1x64 (unpack_32_1x64 (s)))));
1792 static force_inline void
1793 core_combine_out_c_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1797 __m128i xmm_alpha_lo, xmm_alpha_hi;
1798 __m128i xmm_src_lo, xmm_src_hi;
1799 __m128i xmm_dst_lo, xmm_dst_hi;
1800 __m128i xmm_mask_lo, xmm_mask_hi;
1802 /* call prefetch hint to optimize cache load*/
1803 cache_prefetch ((__m128i*)ps);
1804 cache_prefetch ((__m128i*)pd);
1805 cache_prefetch ((__m128i*)pm);
1807 while (w && (unsigned long)pd & 15)
1813 *pd++ = pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1814 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
1818 /* call prefetch hint to optimize cache load*/
1819 cache_prefetch ((__m128i*)ps);
1820 cache_prefetch ((__m128i*)pd);
1821 cache_prefetch ((__m128i*)pm);
1825 /* fill cache line with next memory */
1826 cache_prefetch_next ((__m128i*)ps);
1827 cache_prefetch_next ((__m128i*)pd);
1828 cache_prefetch_next ((__m128i*)pm);
1830 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1831 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1832 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1834 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1835 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1836 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1838 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi);
1839 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, &xmm_alpha_lo, &xmm_alpha_hi);
1841 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
1842 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi);
1844 save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1858 *pd++ = pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1859 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
1864 static force_inline void
1865 core_combine_out_reverse_c_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1869 __m128i xmm_alpha_lo, xmm_alpha_hi;
1870 __m128i xmm_src_lo, xmm_src_hi;
1871 __m128i xmm_dst_lo, xmm_dst_hi;
1872 __m128i xmm_mask_lo, xmm_mask_hi;
1874 /* call prefetch hint to optimize cache load*/
1875 cache_prefetch ((__m128i*)ps);
1876 cache_prefetch ((__m128i*)pd);
1877 cache_prefetch ((__m128i*)pm);
1879 while (w && (unsigned long)pd & 15)
1885 *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d),
1886 negate_1x64 (pix_multiply_1x64 (unpack_32_1x64 (m),
1887 expand_alpha_1x64 (unpack_32_1x64 (s))))));
1891 /* call prefetch hint to optimize cache load*/
1892 cache_prefetch ((__m128i*)ps);
1893 cache_prefetch ((__m128i*)pd);
1894 cache_prefetch ((__m128i*)pm);
1898 /* fill cache line with next memory */
1899 cache_prefetch_next ((__m128i*)ps);
1900 cache_prefetch_next ((__m128i*)pd);
1901 cache_prefetch_next ((__m128i*)pm);
1903 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1904 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1905 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1907 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1908 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1909 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1911 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
1913 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask_lo, &xmm_mask_hi);
1915 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1917 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
1919 save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1933 *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d),
1934 negate_1x64 (pix_multiply_1x64 (unpack_32_1x64 (m),
1935 expand_alpha_1x64 (unpack_32_1x64 (s))))));
1940 static force_inline uint32_t
1941 core_combine_atop_c_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
1943 __m64 m = unpack_32_1x64 (mask);
1944 __m64 s = unpack_32_1x64 (src);
1945 __m64 d = unpack_32_1x64 (dst);
1946 __m64 sa = expand_alpha_1x64 (s);
1947 __m64 da = expand_alpha_1x64 (d);
1949 s = pix_multiply_1x64 (s, m);
1950 m = negate_1x64 (pix_multiply_1x64 (m, sa));
1952 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
1955 static force_inline void
1956 core_combine_atop_c_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1960 __m128i xmm_src_lo, xmm_src_hi;
1961 __m128i xmm_dst_lo, xmm_dst_hi;
1962 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1963 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1964 __m128i xmm_mask_lo, xmm_mask_hi;
1966 /* call prefetch hint to optimize cache load*/
1967 cache_prefetch ((__m128i*)ps);
1968 cache_prefetch ((__m128i*)pd);
1969 cache_prefetch ((__m128i*)pm);
1971 while (w && (unsigned long)pd & 15)
1977 *pd++ = core_combine_atop_c_pixel_sse2 (s, m, d);
1981 /* call prefetch hint to optimize cache load*/
1982 cache_prefetch ((__m128i*)ps);
1983 cache_prefetch ((__m128i*)pd);
1984 cache_prefetch ((__m128i*)pm);
1988 /* fill cache line with next memory */
1989 cache_prefetch_next ((__m128i*)ps);
1990 cache_prefetch_next ((__m128i*)pd);
1991 cache_prefetch_next ((__m128i*)pm);
1993 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1994 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1995 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1997 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1998 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1999 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2001 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2002 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2004 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi);
2005 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_mask_lo, &xmm_mask_hi);
2007 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2009 pix_add_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2010 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2011 &xmm_dst_lo, &xmm_dst_hi);
2013 save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2027 *pd++ = core_combine_atop_c_pixel_sse2 (s, m, d);
2032 static force_inline uint32_t
2033 core_combine_reverse_atop_c_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
2035 __m64 m = unpack_32_1x64 (mask);
2036 __m64 s = unpack_32_1x64 (src);
2037 __m64 d = unpack_32_1x64 (dst);
2039 __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2040 __m64 sa = expand_alpha_1x64 (s);
2042 s = pix_multiply_1x64 (s, m);
2043 m = pix_multiply_1x64 (m, sa);
2045 return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2048 static force_inline void
2049 core_combine_reverse_atop_c_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
2053 __m128i xmm_src_lo, xmm_src_hi;
2054 __m128i xmm_dst_lo, xmm_dst_hi;
2055 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2056 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2057 __m128i xmm_mask_lo, xmm_mask_hi;
2059 /* call prefetch hint to optimize cache load*/
2060 cache_prefetch ((__m128i*)ps);
2061 cache_prefetch ((__m128i*)pd);
2062 cache_prefetch ((__m128i*)pm);
2064 while (w && (unsigned long)pd & 15)
2070 *pd++ = core_combine_reverse_atop_c_pixel_sse2 (s, m, d);
2074 /* call prefetch hint to optimize cache load*/
2075 cache_prefetch ((__m128i*)ps);
2076 cache_prefetch ((__m128i*)pd);
2077 cache_prefetch ((__m128i*)pm);
2081 /* fill cache line with next memory */
2082 cache_prefetch_next ((__m128i*)ps);
2083 cache_prefetch_next ((__m128i*)pd);
2084 cache_prefetch_next ((__m128i*)pm);
2086 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2087 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2088 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2090 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2091 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2092 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2094 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2095 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2097 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi);
2098 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_mask_lo, &xmm_mask_hi);
2100 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2102 pix_add_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2103 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2104 &xmm_dst_lo, &xmm_dst_hi);
2106 save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2120 *pd++ = core_combine_reverse_atop_c_pixel_sse2 (s, m, d);
2125 static force_inline uint32_t
2126 core_combine_xor_c_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
2128 __m64 a = unpack_32_1x64 (mask);
2129 __m64 s = unpack_32_1x64 (src);
2130 __m64 d = unpack_32_1x64 (dst);
2132 __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (a, expand_alpha_1x64 (s)));
2133 __m64 dest = pix_multiply_1x64 (s, a);
2134 __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2136 return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2137 &alpha_dst,
2138 &dest,
2139 &alpha_src));
2142 static force_inline void
2143 core_combine_xor_c_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
2147 __m128i xmm_src_lo, xmm_src_hi;
2148 __m128i xmm_dst_lo, xmm_dst_hi;
2149 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2150 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2151 __m128i xmm_mask_lo, xmm_mask_hi;
2153 /* call prefetch hint to optimize cache load*/
2154 cache_prefetch ((__m128i*)ps);
2155 cache_prefetch ((__m128i*)pd);
2156 cache_prefetch ((__m128i*)pm);
2158 while (w && (unsigned long)pd & 15)
2164 *pd++ = core_combine_xor_c_pixel_sse2 (s, m, d);
2168 /* call prefetch hint to optimize cache load*/
2169 cache_prefetch ((__m128i*)ps);
2170 cache_prefetch ((__m128i*)pd);
2171 cache_prefetch ((__m128i*)pm);
2175 /* fill cache line with next memory */
2176 cache_prefetch_next ((__m128i*)ps);
2177 cache_prefetch_next ((__m128i*)pd);
2178 cache_prefetch_next ((__m128i*)pm);
2180 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2181 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2182 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2184 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2185 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2186 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2188 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2189 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2191 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi);
2192 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_mask_lo, &xmm_mask_hi);
2194 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2195 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2197 pix_add_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2198 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2199 &xmm_dst_lo, &xmm_dst_hi);
2201 save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2215 *pd++ = core_combine_xor_c_pixel_sse2 (s, m, d);
2220 static force_inline void
2221 core_combine_add_c_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
2225 __m128i xmm_src_lo, xmm_src_hi;
2226 __m128i xmm_dst_lo, xmm_dst_hi;
2227 __m128i xmm_mask_lo, xmm_mask_hi;
2229 /* call prefetch hint to optimize cache load*/
2230 cache_prefetch ((__m128i*)ps);
2231 cache_prefetch ((__m128i*)pd);
2232 cache_prefetch ((__m128i*)pm);
2234 while (w && (unsigned long)pd & 15)
2240 *pd++ = pack_1x64_32 (_mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2241 unpack_32_1x64 (m)),
2242 unpack_32_1x64 (d)));
2246 /* call prefetch hint to optimize cache load*/
2247 cache_prefetch ((__m128i*)ps);
2248 cache_prefetch ((__m128i*)pd);
2249 cache_prefetch ((__m128i*)pm);
2253 /* fill cache line with next memory */
2254 cache_prefetch_next ((__m128i*)ps);
2255 cache_prefetch_next ((__m128i*)pd);
2256 cache_prefetch_next ((__m128i*)pm);
2258 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2259 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2260 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2262 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2263 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2264 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2266 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi);
2268 save_128_aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2269 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2283 *pd++ = pack_1x64_32 (_mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2284 unpack_32_1x64 (m)),
2285 unpack_32_1x64 (d)));
2290 /* -------------------------------------------------------------------------------------------------
2291 * fb_compose_setup_sse2
2293 static force_inline __m64
2294 create_mask_16_64 (uint16_t mask)
2296 return _mm_set1_pi16 (mask);
2299 static force_inline __m128i
2300 create_mask_16_128 (uint16_t mask)
2302 return _mm_set1_epi16 (mask);
2305 static force_inline __m64
2306 create_mask_2x32_64 (uint32_t mask0, uint32_t mask1)
2308 return _mm_set_pi32 (mask0, mask1);
2311 static force_inline __m128i
2312 create_mask_2x32_128 (uint32_t mask0, uint32_t mask1)
2314 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
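/* The mask_* globals declared at the top of the file are filled in by the
 * SSE2 setup code, which lies outside this excerpt.  A minimal sketch of
 * that initialisation, using the helpers above (values inferred from how
 * the masks are used; treat as an assumption, not the original code): */
static void
setup_masks_sketch (void)
{
    mask_0080     = create_mask_16_128 (0x0080);
    mask_00ff     = create_mask_16_128 (0x00ff);
    mask_0101     = create_mask_16_128 (0x0101);
    mask_ffff     = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
}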
2317 /* SSE2 code patch for fbcompose.c */
2320 sse2combine_over_u (pixman_implementation_t *imp, pixman_op_t op,
2321 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2323 core_combine_over_u_sse2 (dst, src, mask, width);
2328 sse2combine_over_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
2329 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2331 core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2336 sse2combine_in_u (pixman_implementation_t *imp, pixman_op_t op,
2337 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2339 core_combine_in_u_sse2 (dst, src, mask, width);
2344 sse2combine_in_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
2345 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2347 core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2352 sse2combine_out_u (pixman_implementation_t *imp, pixman_op_t op,
2353 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2355 core_combine_out_u_sse2 (dst, src, mask, width);
2360 sse2combine_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
2361 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2363 core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2368 sse2combine_atop_u (pixman_implementation_t *imp, pixman_op_t op,
2369 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2371 core_combine_atop_u_sse2 (dst, src, mask, width);
2376 sse2combine_atop_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
2377 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2379 core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2384 sse2combine_xor_u (pixman_implementation_t *imp, pixman_op_t op,
2385 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2387 core_combine_xor_u_sse2 (dst, src, mask, width);
2392 sse2combine_add_u (pixman_implementation_t *imp, pixman_op_t op,
2393 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2395 core_combine_add_u_sse2 (dst, src, mask, width);
2400 sse2combine_saturate_u (pixman_implementation_t *imp, pixman_op_t op,
2401 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2403 core_combine_saturate_u_sse2 (dst, src, mask, width);
2408 sse2combine_src_c (pixman_implementation_t *imp, pixman_op_t op,
2409 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2411 core_combine_src_c_sse2 (dst, src, mask, width);
2416 sse2combine_over_c (pixman_implementation_t *imp, pixman_op_t op,
2417 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2419 core_combine_over_c_sse2 (dst, src, mask, width);
2424 sse2combine_over_reverse_c (pixman_implementation_t *imp, pixman_op_t op,
2425 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2427 core_combine_over_reverse_c_sse2 (dst, src, mask, width);
2432 sse2combine_in_c (pixman_implementation_t *imp, pixman_op_t op,
2433 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2435 core_combine_in_c_sse2 (dst, src, mask, width);
2440 sse2combine_in_reverse_c (pixman_implementation_t *imp, pixman_op_t op,
2441 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2443 core_combine_in_reverse_c_sse2 (dst, src, mask, width);
2448 sse2combine_out_c (pixman_implementation_t *imp, pixman_op_t op,
2449 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2451 core_combine_out_c_sse2 (dst, src, mask, width);
2456 sse2combine_out_reverse_c (pixman_implementation_t *imp, pixman_op_t op,
2457 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2459 core_combine_out_reverse_c_sse2 (dst, src, mask, width);
2464 sse2combine_atop_c (pixman_implementation_t *imp, pixman_op_t op,
2465 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2467 core_combine_atop_c_sse2 (dst, src, mask, width);
2472 sse2combine_atop_reverse_c (pixman_implementation_t *imp, pixman_op_t op,
2473 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2475 core_combine_reverse_atop_c_sse2 (dst, src, mask, width);
2480 sse2combine_xor_c (pixman_implementation_t *imp, pixman_op_t op,
2481 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2483 core_combine_xor_c_sse2 (dst, src, mask, width);
2488 sse2combine_add_c (pixman_implementation_t *imp, pixman_op_t op,
2489 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2491 core_combine_add_c_sse2 (dst, src, mask, width);
2495 /* -------------------------------------------------------------------------------------------------
2496 * fast_composite_over_n_8888
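* Solid (n) source OVER an 8888 destination.  Pixels are handled one at a
* time with 64-bit operations until dst reaches a 16-byte boundary, then the
* main loop blends four pixels per aligned 128-bit load/store.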
2500 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2502 pixman_image_t * src_image,
2503 pixman_image_t * mask_image,
2504 pixman_image_t * dst_image,
2515 uint32_t *dst_line, *dst, d;
2518 __m128i xmm_src, xmm_alpha;
2519 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2521 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
2526 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2528 xmm_src = expand_pixel_32_1x128 (src);
2529 xmm_alpha = expand_alpha_1x128 (xmm_src);
2535 /* call prefetch hint to optimize cache load*/
2536 cache_prefetch ((__m128i*)dst);
2538 dst_line += dst_stride;
2541 while (w && (unsigned long)dst & 15)
2544 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2545 _mm_movepi64_pi64 (xmm_alpha),
2546 unpack_32_1x64 (d)));
2550 cache_prefetch ((__m128i*)dst);
2554 /* fill cache line with next memory */
2555 cache_prefetch_next ((__m128i*)dst);
2557 xmm_dst = load_128_aligned ((__m128i*)dst);
2559 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2561 over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_dst_lo, &xmm_dst_hi);
2563 /* rebuild the 4 pixel data and save */
2564 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2573 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2574 _mm_movepi64_pi64 (xmm_alpha),
2575 unpack_32_1x64 (d)));
2583 /* -------------------------------------------------------------------------------------------------
2584 * fast_composite_over_n_0565
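* Solid source OVER an r5g6b5 destination.  The main loop loads eight 16-bit
* pixels at once, expands them to 8888 in four xmm registers with
* unpack_565_128_4x128, blends with over_2x128 and packs back to 565.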
2587 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2589 pixman_image_t * src_image,
2590 pixman_image_t * mask_image,
2591 pixman_image_t * dst_image,
2602 uint16_t *dst_line, *dst, d;
2605 __m128i xmm_src, xmm_alpha;
2606 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2608 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
2613 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2615 xmm_src = expand_pixel_32_1x128 (src);
2616 xmm_alpha = expand_alpha_1x128 (xmm_src);
2622 /* call prefetch hint to optimize cache load*/
2623 cache_prefetch ((__m128i*)dst);
2625 dst_line += dst_stride;
2628 while (w && (unsigned long)dst & 15)
2632 *dst++ = pack_565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2633 _mm_movepi64_pi64 (xmm_alpha),
2634 expand565_16_1x64 (d))));
2638 /* call prefetch hint to optimize cache load*/
2639 cache_prefetch ((__m128i*)dst);
2643 /* fill cache line with next memory */
2644 cache_prefetch_next ((__m128i*)dst);
2646 xmm_dst = load_128_aligned ((__m128i*)dst);
2648 unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2650 over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_dst0, &xmm_dst1);
2651 over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_dst2, &xmm_dst3);
2653 xmm_dst = pack_565_4x128_128 (&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2654 save_128_aligned ((__m128i*)dst, xmm_dst);
2663 *dst++ = pack_565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2664 _mm_movepi64_pi64 (xmm_alpha),
2665 expand565_16_1x64 (d))));
2672 /* -------------------------------------------------------------------------------------------------
2673 * fast_composite_over_n_8888_8888_ca
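* Solid source OVER an 8888 destination with a per-component 8888 mask
* (component alpha).  Blocks of four mask pixels that are entirely zero are
* detected with a movemask compare and skipped, leaving the destination
* untouched.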
2677 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2679 pixman_image_t * src_image,
2680 pixman_image_t * mask_image,
2681 pixman_image_t * dst_image,
2692 uint32_t *dst_line, d;
2693 uint32_t *mask_line, m;
2695 int dst_stride, mask_stride;
2697 __m128i xmm_src, xmm_alpha;
2698 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2699 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2701 __m64 mmsrc_x, mmx_alpha, mmmask_x, mmdest_x;
2703 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
2708 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2709 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2711 xmm_src = _mm_unpacklo_epi8 (create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2712 xmm_alpha = expand_alpha_1x128 (xmm_src);
2713 mmsrc_x = _mm_movepi64_pi64 (xmm_src);
2714 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
2719 const uint32_t *pm = (uint32_t *)mask_line;
2720 uint32_t *pd = (uint32_t *)dst_line;
2722 dst_line += dst_stride;
2723 mask_line += mask_stride;
2725 /* call prefetch hint to optimize cache load*/
2726 cache_prefetch ((__m128i*)pd);
2727 cache_prefetch ((__m128i*)pm);
2729 while (w && (unsigned long)pd & 15)
2736 mmmask_x = unpack_32_1x64 (m);
2737 mmdest_x = unpack_32_1x64 (d);
2739 *pd = pack_1x64_32 (in_over_1x64 (&mmsrc_x,
2749 /* call prefetch hint to optimize cache load*/
2750 cache_prefetch ((__m128i*)pd);
2751 cache_prefetch ((__m128i*)pm);
2755 /* fill cache line with next memory */
2756 cache_prefetch_next ((__m128i*)pd);
2757 cache_prefetch_next ((__m128i*)pm);
2759 xmm_mask = load_128_unaligned ((__m128i*)pm);
2761 pack_cmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128()));
2763 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2764 if (pack_cmp != 0xffff)
2766 xmm_dst = load_128_aligned ((__m128i*)pd);
2768 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2769 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2771 in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
2773 save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2788 mmmask_x = unpack_32_1x64 (m);
2789 mmdest_x = unpack_32_1x64 (d);
2791 *pd = pack_1x64_32 (in_over_1x64 (&mmsrc_x,
2806 /* -------------------------------------------------------------------------------------------------
2807 * fast_composite_over_8888_n_8888
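* 8888 source OVER an 8888 destination, scaled by a solid mask; only the
* alpha byte of the solid mask is used (mask >> 24), replicated into every
* 16-bit lane of xmm_mask.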
2811 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2813 pixman_image_t * src_image,
2814 pixman_image_t * mask_image,
2815 pixman_image_t * dst_image,
2825 uint32_t *dst_line, *dst;
2826 uint32_t *src_line, *src;
2829 int dst_stride, src_stride;
2832 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2833 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2834 __m128i xmm_alpha_lo, xmm_alpha_hi;
2836 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2837 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2838 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
2840 xmm_mask = create_mask_16_128 (mask >> 24);
2845 dst_line += dst_stride;
2847 src_line += src_stride;
2850 /* call prefetch hint to optimize cache load*/
2851 cache_prefetch ((__m128i*)dst);
2852 cache_prefetch ((__m128i*)src);
2854 while (w && (unsigned long)dst & 15)
2856 uint32_t s = *src++;
2859 __m64 ms = unpack_32_1x64 (s);
2860 __m64 alpha = expand_alpha_1x64 (ms);
2861 __m64 dest = _mm_movepi64_pi64 (xmm_mask);
2862 __m64 alpha_dst = unpack_32_1x64 (d);
2864 *dst++ = pack_1x64_32 (in_over_1x64 (&ms,
2872 /* call prefetch hint to optimize cache load*/
2873 cache_prefetch ((__m128i*)dst);
2874 cache_prefetch ((__m128i*)src);
2878 /* fill cache line with next memory */
2879 cache_prefetch_next ((__m128i*)dst);
2880 cache_prefetch_next ((__m128i*)src);
2882 xmm_src = load_128_unaligned ((__m128i*)src);
2883 xmm_dst = load_128_aligned ((__m128i*)dst);
2885 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2886 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2887 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
2889 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask, &xmm_mask, &xmm_dst_lo, &xmm_dst_hi);
2891 save_128_aligned( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2900 uint32_t s = *src++;
2903 __m64 ms = unpack_32_1x64 (s);
2904 __m64 alpha = expand_alpha_1x64 (ms);
2905 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
2906 __m64 dest = unpack_32_1x64 (d);
2908 *dst++ = pack_1x64_32 (in_over_1x64 (&ms,
2920 /* -------------------------------------------------------------------------------------------------
2921 * fast_composite_over_x888_n_8888
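* x888 source OVER an 8888 destination with a solid mask.  The source is
* forced opaque by OR-ing in 0xff000000 (mask_ff000000 in the vector path),
* so xmm_alpha can simply be the constant mask_00ff.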
2924 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2926 pixman_image_t * src_image,
2927 pixman_image_t * mask_image,
2928 pixman_image_t * dst_image,
2938 uint32_t *dst_line, *dst;
2939 uint32_t *src_line, *src;
2941 int dst_stride, src_stride;
2944 __m128i xmm_mask, xmm_alpha;
2945 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2946 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2948 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2949 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2950 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
2952 xmm_mask = create_mask_16_128 (mask >> 24);
2953 xmm_alpha = mask_00ff;
2958 dst_line += dst_stride;
2960 src_line += src_stride;
2963 /* call prefetch hint to optimize cache load*/
2964 cache_prefetch ((__m128i*)dst);
2965 cache_prefetch ((__m128i*)src);
2967 while (w && (unsigned long)dst & 15)
2969 uint32_t s = (*src++) | 0xff000000;
2972 __m64 src = unpack_32_1x64 (s);
2973 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
2974 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
2975 __m64 dest = unpack_32_1x64 (d);
2977 *dst++ = pack_1x64_32 (in_over_1x64 (&src,
2985 /* call prefetch hint to optimize cache load*/
2986 cache_prefetch ((__m128i*)dst);
2987 cache_prefetch ((__m128i*)src);
2991 /* fill cache line with next memory */
2992 cache_prefetch_next ((__m128i*)dst);
2993 cache_prefetch_next ((__m128i*)src);
2995 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
2996 xmm_dst = load_128_aligned ((__m128i*)dst);
2998 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2999 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3001 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha, &xmm_alpha, &xmm_mask, &xmm_mask, &xmm_dst_lo, &xmm_dst_hi);
3003 save_128_aligned( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3013 uint32_t s = (*src++) | 0xff000000;
3016 __m64 src = unpack_32_1x64 (s);
3017 __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3018 __m64 mask = _mm_movepi64_pi64 (xmm_mask);
3019 __m64 dest = unpack_32_1x64 (d);
3021 *dst++ = pack_1x64_32 (in_over_1x64 (&src,
3033 /* -------------------------------------------------------------------------------------------------
3034 * fast_composite_over_8888_8888
3037 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3039 pixman_image_t * src_image,
3040 pixman_image_t * mask_image,
3041 pixman_image_t * dst_image,
3051 int dst_stride, src_stride;
3052 uint32_t *dst_line, *dst;
3053 uint32_t *src_line, *src;
3055 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3056 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3063 core_combine_over_u_sse2 (dst, src, NULL, width);
3071 /* -------------------------------------------------------------------------------------------------
3072 * fast_composite_over_8888_0565
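* 8888 source OVER an r5g6b5 destination.  The per-pixel helper below expands
* the 16-bit destination to 8888, applies over_1x64 and packs the result back
* to 565; the vector loop handles eight destination pixels (two unaligned
* 4-pixel source loads) per iteration.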
3074 static force_inline uint16_t
3075 fast_composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3079 ms = unpack_32_1x64 (src);
3080 return pack_565_32_16( pack_1x64_32 (over_1x64 (ms,
3081 expand_alpha_1x64 (ms),
3082 expand565_16_1x64 (dst))));
3086 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3088 pixman_image_t * src_image,
3089 pixman_image_t * mask_image,
3090 pixman_image_t * dst_image,
3100 uint16_t *dst_line, *dst, d;
3101 uint32_t *src_line, *src, s;
3102 int dst_stride, src_stride;
3105 __m128i xmm_alpha_lo, xmm_alpha_hi;
3106 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3107 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3109 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3110 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3115 * This code is copied from the MMX version, keeping the FIXME.
3116 * If it's a problem there, it is probably a problem here as well.
3118 assert (src_image->drawable == mask_image->drawable);
3126 /* call prefetch hint to optimize cache load*/
3127 cache_prefetch ((__m128i*)src);
3128 cache_prefetch ((__m128i*)dst);
3130 dst_line += dst_stride;
3131 src_line += src_stride;
3134 /* Align dst on a 16-byte boundary */
3136 ((unsigned long)dst & 15))
3141 *dst++ = fast_composite_over_8888_0565pixel (s, d);
3145 /* call prefetch hint to optimize cache load*/
3146 cache_prefetch ((__m128i*)src);
3147 cache_prefetch ((__m128i*)dst);
3149 /* It's an 8 pixel loop */
3152 /* fill cache line with next memory */
3153 cache_prefetch_next ((__m128i*)src);
3154 cache_prefetch_next ((__m128i*)dst);
3156 /* The source address may not be 16-byte aligned, so load it unaligned. */
3157 xmm_src = load_128_unaligned ((__m128i*) src);
3158 xmm_dst = load_128_aligned ((__m128i*) dst);
3161 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3162 unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3163 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
3165 /* Load the next 4 source pixels ahead of time to overlap the memory read. */
3166 xmm_src = load_128_unaligned ((__m128i*) (src+4));
3168 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst0, &xmm_dst1);
3171 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3172 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
3174 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst2, &xmm_dst3);
3176 save_128_aligned ((__m128i*)dst, pack_565_4x128_128 (&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3188 *dst++ = fast_composite_over_8888_0565pixel (s, d);
3195 /* -------------------------------------------------------------------------------------------------
3196 * fast_composite_over_n_8_8888
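* Solid source OVER an 8888 destination with an a8 mask.  Four mask bytes are
* fetched at a time; when the source is opaque and all four bytes are 0xff,
* the precomputed solid value (xmm_def) is stored directly.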
3200 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3202 pixman_image_t * src_image,
3203 pixman_image_t * mask_image,
3204 pixman_image_t * dst_image,
3215 uint32_t *dst_line, *dst;
3216 uint8_t *mask_line, *mask;
3217 int dst_stride, mask_stride;
3221 __m128i xmm_src, xmm_alpha, xmm_def;
3222 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3223 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3225 __m64 mmsrc_x, mmx_alpha, mmmask_x, mmx_dest;
3227 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
3233 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3234 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3236 xmm_def = create_mask_2x32_128 (src, src);
3237 xmm_src = expand_pixel_32_1x128 (src);
3238 xmm_alpha = expand_alpha_1x128 (xmm_src);
3239 mmsrc_x = _mm_movepi64_pi64 (xmm_src);
3240 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3245 dst_line += dst_stride;
3247 mask_line += mask_stride;
3250 /* call prefetch hint to optimize cache load*/
3251 cache_prefetch ((__m128i*)mask);
3252 cache_prefetch ((__m128i*)dst);
3254 while (w && (unsigned long)dst & 15)
3256 uint8_t m = *mask++;
3261 mmmask_x = expand_pixel_8_1x64 (m);
3262 mmx_dest = unpack_32_1x64 (d);
3264 *dst = pack_1x64_32 (in_over_1x64 (&mmsrc_x,
3274 /* call prefetch hint to optimize cache load*/
3275 cache_prefetch ((__m128i*)mask);
3276 cache_prefetch ((__m128i*)dst);
3280 /* fill cache line with next memory */
3281 cache_prefetch_next ((__m128i*)mask);
3282 cache_prefetch_next ((__m128i*)dst);
3284 m = *((uint32_t*)mask);
3286 if (srca == 0xff && m == 0xffffffff)
3288 save_128_aligned ((__m128i*)dst, xmm_def);
3292 xmm_dst = load_128_aligned ((__m128i*) dst);
3293 xmm_mask = unpack_32_1x128 (m);
3294 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128());
3297 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3298 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3300 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
3302 in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
3304 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3314 uint8_t m = *mask++;
3319 mmmask_x = expand_pixel_8_1x64 (m);
3320 mmx_dest = unpack_32_1x64 (d);
3322 *dst = pack_1x64_32 (in_over_1x64 (&mmsrc_x,
3336 /* -------------------------------------------------------------------------------------------------
3337 * pixman_fill_sse2
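* Solid fill for 16bpp and 32bpp destinations.  A 16bpp fill is only taken
* when the two halves of 'data' are identical; the inner loop stores the
* replicated value in 128-, 64-, 32- and 16-byte blocks, with scalar 16- and
* 32-bit stores for the misaligned head and tail.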
3341 pixman_fill_sse2 (uint32_t *bits,
3350 uint32_t byte_width;
3355 if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3358 if (bpp != 16 && bpp != 32)
3363 stride = stride * (int) sizeof (uint32_t) / 2;
3364 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3365 byte_width = 2 * width;
3370 stride = stride * (int) sizeof (uint32_t) / 4;
3371 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3372 byte_width = 4 * width;
3376 cache_prefetch ((__m128i*)byte_line);
3377 xmm_def = create_mask_2x32_128 (data, data);
3382 uint8_t *d = byte_line;
3383 byte_line += stride;
3387 cache_prefetch_next ((__m128i*)d);
3389 while (w >= 2 && ((unsigned long)d & 3))
3391 *(uint16_t *)d = data;
3396 while (w >= 4 && ((unsigned long)d & 15))
3398 *(uint32_t *)d = data;
3404 cache_prefetch_next ((__m128i*)d);
3408 cache_prefetch (((__m128i*)d) + 12);
3410 save_128_aligned ((__m128i*)(d), xmm_def);
3411 save_128_aligned ((__m128i*)(d+16), xmm_def);
3412 save_128_aligned ((__m128i*)(d+32), xmm_def);
3413 save_128_aligned ((__m128i*)(d+48), xmm_def);
3414 save_128_aligned ((__m128i*)(d+64), xmm_def);
3415 save_128_aligned ((__m128i*)(d+80), xmm_def);
3416 save_128_aligned ((__m128i*)(d+96), xmm_def);
3417 save_128_aligned ((__m128i*)(d+112), xmm_def);
3425 cache_prefetch (((__m128i*)d) + 8);
3427 save_128_aligned ((__m128i*)(d), xmm_def);
3428 save_128_aligned ((__m128i*)(d+16), xmm_def);
3429 save_128_aligned ((__m128i*)(d+32), xmm_def);
3430 save_128_aligned ((__m128i*)(d+48), xmm_def);
3436 cache_prefetch_next ((__m128i*)d);
3440 save_128_aligned ((__m128i*)(d), xmm_def);
3441 save_128_aligned ((__m128i*)(d+16), xmm_def);
3449 save_128_aligned ((__m128i*)(d), xmm_def);
3455 cache_prefetch_next ((__m128i*)d);
3459 *(uint32_t *)d = data;
3467 *(uint16_t *)d = data;
3478 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3480 pixman_image_t * src_image,
3481 pixman_image_t * mask_image,
3482 pixman_image_t * dst_image,
3493 uint32_t *dst_line, *dst;
3494 uint8_t *mask_line, *mask;
3495 int dst_stride, mask_stride;
3499 __m128i xmm_src, xmm_def;
3500 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3502 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
3507 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3508 PIXMAN_FORMAT_BPP (dst_image->bits.format),
3509 dest_x, dest_y, width, height, 0);
3513 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3514 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3516 xmm_def = create_mask_2x32_128 (src, src);
3517 xmm_src = expand_pixel_32_1x128 (src);
3522 dst_line += dst_stride;
3524 mask_line += mask_stride;
3527 /* call prefetch hint to optimize cache load*/
3528 cache_prefetch ((__m128i*)mask);
3529 cache_prefetch ((__m128i*)dst);
3531 while (w && (unsigned long)dst & 15)
3533 uint8_t m = *mask++;
3537 *dst = pack_1x64_32 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3548 /* call prefetch hint to optimize cache load*/
3549 cache_prefetch ((__m128i*)mask);
3550 cache_prefetch ((__m128i*)dst);
3554 /* fill cache line with next memory */
3555 cache_prefetch_next ((__m128i*)mask);
3556 cache_prefetch_next ((__m128i*)dst);
3558 m = *((uint32_t*)mask);
3560 if (srca == 0xff && m == 0xffffffff)
3562 save_128_aligned ((__m128i*)dst, xmm_def);
3566 xmm_mask = unpack_32_1x128 (m);
3567 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128());
3570 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3572 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
3574 pix_multiply_2x128 (&xmm_src, &xmm_src, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
3576 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3580 save_128_aligned ((__m128i*)dst, _mm_setzero_si128());
3590 uint8_t m = *mask++;
3594 *dst = pack_1x64_32 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
3609 /* -------------------------------------------------------------------------------------------------
3610 * fast_composite_over_n_8_0565
3614 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3616 pixman_image_t * src_image,
3617 pixman_image_t * mask_image,
3618 pixman_image_t * dst_image,
3629 uint16_t *dst_line, *dst, d;
3630 uint8_t *mask_line, *mask;
3631 int dst_stride, mask_stride;
3634 __m64 mmsrc_x, mmx_alpha, mmmask_x, mmx_dest;
3636 __m128i xmm_src, xmm_alpha;
3637 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3638 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3640 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
3646 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3647 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3649 xmm_src = expand_pixel_32_1x128 (src);
3650 xmm_alpha = expand_alpha_1x128 (xmm_src);
3651 mmsrc_x = _mm_movepi64_pi64 (xmm_src);
3652 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3657 dst_line += dst_stride;
3659 mask_line += mask_stride;
3662 /* call prefetch hint to optimize cache load*/
3663 cache_prefetch ((__m128i*)mask);
3664 cache_prefetch ((__m128i*)dst);
3666 while (w && (unsigned long)dst & 15)
3673 mmmask_x = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
3674 mmx_dest = expand565_16_1x64 (d);
3676 *dst = pack_565_32_16 (pack_1x64_32 (in_over_1x64 (&mmsrc_x,
3686 /* call prefetch hint to optimize cache load*/
3687 cache_prefetch ((__m128i*)mask);
3688 cache_prefetch ((__m128i*)dst);
3692 /* fill cache line with next memory */
3693 cache_prefetch_next ((__m128i*)mask);
3694 cache_prefetch_next ((__m128i*)dst);
3696 xmm_dst = load_128_aligned ((__m128i*) dst);
3697 unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3699 m = *((uint32_t*)mask);
3704 xmm_mask = unpack_32_1x128 (m);
3705 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128());
3708 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3710 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
3711 in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst0, &xmm_dst1);
3714 m = *((uint32_t*)mask);
3719 xmm_mask = unpack_32_1x128 (m);
3720 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128());
3723 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3725 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
3726 in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst2, &xmm_dst3);
3729 save_128_aligned ((__m128i*)dst, pack_565_4x128_128 (&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3742 mmmask_x = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
3743 mmx_dest = expand565_16_1x64 (d);
3745 *dst = pack_565_32_16 (pack_1x64_32 (in_over_1x64 (&mmsrc_x,
3759 /* -------------------------------------------------------------------------------------------------
3760 * fast_composite_over_pixbuf_0565
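* Non-premultiplied (pixbuf) source OVER an r5g6b5 destination.  is_opaque /
* is_zero classify each 4-pixel source block up front so the fully opaque and
* fully transparent cases can bypass the full over_rev_non_pre_2x128 blend.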
3764 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3766 pixman_image_t * src_image,
3767 pixman_image_t * mask_image,
3768 pixman_image_t * dst_image,
3778 uint16_t *dst_line, *dst, d;
3779 uint32_t *src_line, *src, s;
3780 int dst_stride, src_stride;
3782 uint32_t opaque, zero;
3785 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3786 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3788 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3789 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3794 * This code is copied from the MMX version, keeping the FIXME.
3795 * If it's a problem there, it is probably a problem here as well.
3797 assert (src_image->drawable == mask_image->drawable);
3803 dst_line += dst_stride;
3805 src_line += src_stride;
3808 /* call prefetch hint to optimize cache load*/
3809 cache_prefetch ((__m128i*)src);
3810 cache_prefetch ((__m128i*)dst);
3812 while (w && (unsigned long)dst & 15)
3817 ms = unpack_32_1x64 (s);
3819 *dst++ = pack_565_32_16 (pack_1x64_32 (over_rev_non_pre_1x64(ms, expand565_16_1x64 (d))));
3823 /* call prefetch hint to optimize cache load*/
3824 cache_prefetch ((__m128i*)src);
3825 cache_prefetch ((__m128i*)dst);
3829 /* fill cache line with next memory */
3830 cache_prefetch_next ((__m128i*)src);
3831 cache_prefetch_next ((__m128i*)dst);
3834 xmm_src = load_128_unaligned((__m128i*)src);
3835 xmm_dst = load_128_aligned ((__m128i*)dst);
3837 opaque = is_opaque (xmm_src);
3838 zero = is_zero (xmm_src);
3840 unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3841 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3843 /* preload next round*/
3844 xmm_src = load_128_unaligned((__m128i*)(src+4));
3848 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst0, &xmm_dst1);
3852 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst0, &xmm_dst1);
3856 opaque = is_opaque (xmm_src);
3857 zero = is_zero (xmm_src);
3859 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3863 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst2, &xmm_dst3);
3867 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst2, &xmm_dst3);
3870 save_128_aligned ((__m128i*)dst, pack_565_4x128_128 (&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3882 ms = unpack_32_1x64 (s);
3884 *dst++ = pack_565_32_16 (pack_1x64_32 (over_rev_non_pre_1x64(ms, expand565_16_1x64 (d))));
3892 /* -------------------------------------------------------------------------------------------------
3893 * fast_composite_over_pixbuf_8888
3897 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3899 pixman_image_t * src_image,
3900 pixman_image_t * mask_image,
3901 pixman_image_t * dst_image,
3911 uint32_t *dst_line, *dst, d;
3912 uint32_t *src_line, *src, s;
3913 int dst_stride, src_stride;
3915 uint32_t opaque, zero;
3917 __m128i xmm_src_lo, xmm_src_hi;
3918 __m128i xmm_dst_lo, xmm_dst_hi;
3920 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3921 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3926 * This code is copied from the MMX version, keeping the FIXME.
3927 * If it's a problem there, it is probably a problem here as well.
3929 assert (src_image->drawable == mask_image->drawable);
3935 dst_line += dst_stride;
3937 src_line += src_stride;
3940 /* call prefetch hint to optimize cache load*/
3941 cache_prefetch ((__m128i*)src);
3942 cache_prefetch ((__m128i*)dst);
3944 while (w && (unsigned long)dst & 15)
3949 *dst++ = pack_1x64_32 (over_rev_non_pre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
3954 /* call prefetch hint to optimize cache load*/
3955 cache_prefetch ((__m128i*)src);
3956 cache_prefetch ((__m128i*)dst);
3960 /* fill cache line with next memory */
3961 cache_prefetch_next ((__m128i*)src);
3962 cache_prefetch_next ((__m128i*)dst);
3964 xmm_src_hi = load_128_unaligned((__m128i*)src);
3966 opaque = is_opaque (xmm_src_hi);
3967 zero = is_zero (xmm_src_hi);
3969 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3973 invert_colors_2x128( xmm_src_lo, xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi);
3975 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3979 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3981 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3983 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi);
3985 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3998 *dst++ = pack_1x64_32 (over_rev_non_pre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
4007 /* -------------------------------------------------------------------------------------------------
4008 * fast_composite_over_n_8888_0565_ca
4012 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4014 pixman_image_t * src_image,
4015 pixman_image_t * mask_image,
4016 pixman_image_t * dst_image,
4027 uint16_t *dst_line, *dst, d;
4028 uint32_t *mask_line, *mask, m;
4029 int dst_stride, mask_stride;
4033 __m128i xmm_src, xmm_alpha;
4034 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4035 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4037 __m64 mmsrc_x, mmx_alpha, mmmask_x, mmx_dest;
4039 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
4044 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4045 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4047 xmm_src = expand_pixel_32_1x128 (src);
4048 xmm_alpha = expand_alpha_1x128 (xmm_src);
4049 mmsrc_x = _mm_movepi64_pi64 (xmm_src);
4050 mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4057 mask_line += mask_stride;
4058 dst_line += dst_stride;
4060 /* call prefetch hint to optimize cache load*/
4061 cache_prefetch ((__m128i*)mask);
4062 cache_prefetch ((__m128i*)dst);
4064 while (w && ((unsigned long)dst & 15))
4066 m = *(uint32_t *) mask;
4071 mmmask_x = unpack_32_1x64 (m);
4072 mmx_dest = expand565_16_1x64 (d);
4074 *dst = pack_565_32_16 (pack_1x64_32 (in_over_1x64 (&mmsrc_x,
4085 /* call prefetch hint to optimize cache load*/
4086 cache_prefetch ((__m128i*)mask);
4087 cache_prefetch ((__m128i*)dst);
4091 /* fill cache line with next memory */
4092 cache_prefetch_next ((__m128i*)mask);
4093 cache_prefetch_next ((__m128i*)dst);
4096 xmm_mask = load_128_unaligned((__m128i*)mask);
4097 xmm_dst = load_128_aligned((__m128i*)dst);
4099 pack_cmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128()));
4101 unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4102 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4104 /* preload next round*/
4105 xmm_mask = load_128_unaligned((__m128i*)(mask+4));
4108 if (pack_cmp != 0xffff)
4110 in_over_2x128(&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst0, &xmm_dst1);
4114 pack_cmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128()));
4116 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4118 if (pack_cmp != 0xffff)
4120 in_over_2x128(&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst2, &xmm_dst3);
4123 save_128_aligned ((__m128i*)dst, pack_565_4x128_128 (&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4132 m = *(uint32_t *) mask;
4137 mmmask_x = unpack_32_1x64 (m);
4138 mmx_dest = expand565_16_1x64 (d);
4140 *dst = pack_565_32_16 (pack_1x64_32 (in_over_1x64 (&mmsrc_x,
4155 /* -------------------------------------------------------------------------------------------------
4156 * fast_composite_in_n_8_8
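* IN operator with a solid source and an a8 mask over an a8 destination:
* each destination byte is multiplied by (source alpha * mask), all in
* 0-255 fixed point.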
4160 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4162 pixman_image_t * src_image,
4163 pixman_image_t * mask_image,
4164 pixman_image_t * dst_image,
4174 uint8_t *dst_line, *dst;
4175 uint8_t *mask_line, *mask;
4176 int dst_stride, mask_stride;
4182 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4183 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4185 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4186 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4188 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
4194 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4199 dst_line += dst_stride;
4201 mask_line += mask_stride;
4204 /* call prefetch hint to optimize cache load*/
4205 cache_prefetch ((__m128i*)mask);
4206 cache_prefetch ((__m128i*)dst);
4208 while (w && ((unsigned long)dst & 15))
4210 m = (uint32_t) *mask++;
4211 d = (uint32_t) *dst;
4213 *dst++ = (uint8_t) pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4214 unpack_32_1x64 (d)));
4218 /* call prefetch hint to optimize cache load*/
4219 cache_prefetch ((__m128i*)mask);
4220 cache_prefetch ((__m128i*)dst);
4224 /* fill cache line with next memory */
4225 cache_prefetch_next ((__m128i*)mask);
4226 cache_prefetch_next ((__m128i*)dst);
4228 xmm_mask = load_128_unaligned((__m128i*)mask);
4229 xmm_dst = load_128_aligned((__m128i*)dst);
4231 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4232 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4234 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4235 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4237 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4246 m = (uint32_t) *mask++;
4247 d = (uint32_t) *dst;
4249 *dst++ = (uint8_t) pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4250 unpack_32_1x64 (d)));
4258 /* -------------------------------------------------------------------------------------------------
4259 * fast_composite_in_8_8
4263 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4265 pixman_image_t * src_image,
4266 pixman_image_t * mask_image,
4267 pixman_image_t * dst_image,
4277 uint8_t *dst_line, *dst;
4278 uint8_t *src_line, *src;
4279 int src_stride, dst_stride;
4283 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4284 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4286 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4287 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4292 dst_line += dst_stride;
4294 src_line += src_stride;
4297 /* call prefetch hint to optimize cache load*/
4298 cache_prefetch ((__m128i*)src);
4299 cache_prefetch ((__m128i*)dst);
4301 while (w && ((unsigned long)dst & 15))
4303 s = (uint32_t) *src++;
4304 d = (uint32_t) *dst;
4306 *dst++ = (uint8_t) pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
4310 /* call prefetch hint to optimize cache load*/
4311 cache_prefetch ((__m128i*)src);
4312 cache_prefetch ((__m128i*)dst);
4316 /* fill cache line with next memory */
4317 cache_prefetch_next ((__m128i*)src);
4318 cache_prefetch_next ((__m128i*)dst);
4320 xmm_src = load_128_unaligned((__m128i*)src);
4321 xmm_dst = load_128_aligned((__m128i*)dst);
4323 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4324 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4326 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4328 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4337 s = (uint32_t) *src++;
4338 d = (uint32_t) *dst;
4340 *dst++ = (uint8_t) pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
4348 /* -------------------------------------------------------------------------------------------------
4349 * fast_composite_add_8888_8_8
4353 sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
4355 pixman_image_t * src_image,
4356 pixman_image_t * mask_image,
4357 pixman_image_t * dst_image,
4367 uint8_t *dst_line, *dst;
4368 uint8_t *mask_line, *mask;
4369 int dst_stride, mask_stride;
4376 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4377 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4379 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4380 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4382 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
4388 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4393 dst_line += dst_stride;
4395 mask_line += mask_stride;
4398 /* call prefetch hint to optimize cache load*/
4399 cache_prefetch ((__m128i*)mask);
4400 cache_prefetch ((__m128i*)dst);
4402 while (w && ((unsigned long)dst & 15))
4404 m = (uint32_t) *mask++;
4405 d = (uint32_t) *dst;
4407 *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4408 unpack_32_1x64 (d)));
4412 /* call prefetch hint to optimize cache load*/
4413 cache_prefetch ((__m128i*)mask);
4414 cache_prefetch ((__m128i*)dst);
4418 /* fill cache line with next memory */
4419 cache_prefetch_next ((__m128i*)mask);
4420 cache_prefetch_next ((__m128i*)dst);
4422 xmm_mask = load_128_unaligned((__m128i*)mask);
4423 xmm_dst = load_128_aligned((__m128i*)dst);
4425 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4426 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4428 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4430 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4431 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4433 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4442 m = (uint32_t) *mask++;
4443 d = (uint32_t) *dst;
4445 *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4446 unpack_32_1x64 (d)));
4454 /* -------------------------------------------------------------------------------------------------
4455 * fast_composite_add_8000_8000
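* Saturating ADD of two a8 images.  Once dst is 4-byte aligned, the bulk of
* each scanline is handed to core_combine_add_u_sse2, treating groups of four
* bytes as 32-bit quantities (w >> 2).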
4459 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
4461 pixman_image_t * src_image,
4462 pixman_image_t * mask_image,
4463 pixman_image_t * dst_image,
4473 uint8_t *dst_line, *dst;
4474 uint8_t *src_line, *src;
4475 int dst_stride, src_stride;
4479 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4480 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4487 /* call prefetch hint to optimize cache load*/
4488 cache_prefetch ((__m128i*)src);
4489 cache_prefetch ((__m128i*)dst);
4491 dst_line += dst_stride;
4492 src_line += src_stride;
4496 while (w && (unsigned long)dst & 3)
4498 t = (*dst) + (*src++);
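/* Branch-free clamp: if the sum overflows 8 bits, (t >> 8) is 1, so
 * 0 - (t >> 8) becomes 0xff and the OR saturates the result to 255. */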
4499 *dst++ = t | (0 - (t >> 8));
4503 core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4513 t = (*dst) + (*src++);
4514 *dst++ = t | (0 - (t >> 8));
4522 /* -------------------------------------------------------------------------------------------------
4523 * fast_composite_add_8888_8888
4526 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4528 pixman_image_t * src_image,
4529 pixman_image_t * mask_image,
4530 pixman_image_t * dst_image,
4540 uint32_t *dst_line, *dst;
4541 uint32_t *src_line, *src;
4542 int dst_stride, src_stride;
4544 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4545 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4550 dst_line += dst_stride;
4552 src_line += src_stride;
4554 core_combine_add_u_sse2 (dst, src, NULL, width);
4560 /* -------------------------------------------------------------------------------------------------
4561 * sse2_composite_copy_area
4564 static pixman_bool_t
4565 pixman_blt_sse2 (uint32_t *src_bits,
4571 int src_x, int src_y,
4572 int dst_x, int dst_y,
4573 int width, int height)
4575 uint8_t * src_bytes;
4576 uint8_t * dst_bytes;
4579 if (src_bpp != dst_bpp)
4584 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4585 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4586 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4587 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4588 byte_width = 2 * width;
4592 else if (src_bpp == 32)
4594 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4595 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4596 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4597 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4598 byte_width = 4 * width;
4607 cache_prefetch ((__m128i*)src_bytes);
4608 cache_prefetch ((__m128i*)dst_bytes);
4613 uint8_t *s = src_bytes;
4614 uint8_t *d = dst_bytes;
4615 src_bytes += src_stride;
4616 dst_bytes += dst_stride;
4619 cache_prefetch_next ((__m128i*)s);
4620 cache_prefetch_next ((__m128i*)d);
4622 while (w >= 2 && ((unsigned long)d & 3))
4624 *(uint16_t *)d = *(uint16_t *)s;
4630 while (w >= 4 && ((unsigned long)d & 15))
4632 *(uint32_t *)d = *(uint32_t *)s;
4639 cache_prefetch_next ((__m128i*)s);
4640 cache_prefetch_next ((__m128i*)d);
4644 __m128i xmm0, xmm1, xmm2, xmm3;
4646 /* 128 bytes ahead */
4647 cache_prefetch (((__m128i*)s) + 8);
4648 cache_prefetch (((__m128i*)d) + 8);
4650 xmm0 = load_128_unaligned ((__m128i*)(s));
4651 xmm1 = load_128_unaligned ((__m128i*)(s+16));
4652 xmm2 = load_128_unaligned ((__m128i*)(s+32));
4653 xmm3 = load_128_unaligned ((__m128i*)(s+48));
4655 save_128_aligned ((__m128i*)(d), xmm0);
4656 save_128_aligned ((__m128i*)(d+16), xmm1);
4657 save_128_aligned ((__m128i*)(d+32), xmm2);
4658 save_128_aligned ((__m128i*)(d+48), xmm3);
4665 cache_prefetch_next ((__m128i*)s);
4666 cache_prefetch_next ((__m128i*)d);
4670 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4677 cache_prefetch_next ((__m128i*)s);
4678 cache_prefetch_next ((__m128i*)d);
4682 *(uint32_t *)d = *(uint32_t *)s;
4691 *(uint16_t *)d = *(uint16_t *)s;
4704 sse2_composite_copy_area (pixman_implementation_t *imp,
4706 pixman_image_t * src_image,
4707 pixman_image_t * mask_image,
4708 pixman_image_t * dst_image,
4718 pixman_blt_sse2 (src_image->bits.bits,
4719 dst_image->bits.bits,
4720 src_image->bits.rowstride,
4721 dst_image->bits.rowstride,
4722 PIXMAN_FORMAT_BPP (src_image->bits.format),
4723 PIXMAN_FORMAT_BPP (dst_image->bits.format),
4724 src_x, src_y, dest_x, dest_y, width, height);
4728 /* This code is buggy in the MMX version, and the bug has been carried over to this SSE2 version */
4730 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4732 pixman_image_t * src_image,
4733 pixman_image_t * mask_image,
4734 pixman_image_t * dst_image,
4744 uint32_t *src, *src_line, s;
4745 uint32_t *dst, *dst_line, d;
4746 uint8_t *mask, *mask_line;
4748 int src_stride, mask_stride, dst_stride;
4751 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4752 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4753 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4755 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4756 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4757 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4762 src_line += src_stride;
4764 dst_line += dst_stride;
4766 mask_line += mask_stride;
4770 /* call prefetch hint to optimize cache load*/
4771 cache_prefetch ((__m128i*)src);
4772 cache_prefetch ((__m128i*)dst);
4773 cache_prefetch ((__m128i*)mask);
4775 while (w && (unsigned long)dst & 15)
4777 s = 0xff000000 | *src++;
4778 m = (uint32_t) *mask++;
4781 __m64 ms = unpack_32_1x64 (s);
4785 ms = in_over_1x64 (ms,
4787 expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
4788 unpack_32_1x64 (d));
4791 *dst++ = pack_1x64_32 (ms);
4795 /* call prefetch hint to optimize cache load*/
4796 cache_prefetch ((__m128i*)src);
4797 cache_prefetch ((__m128i*)dst);
4798 cache_prefetch ((__m128i*)mask);
4802 /* fill cache line with next memory */
4803 cache_prefetch_next ((__m128i*)src);
4804 cache_prefetch_next ((__m128i*)dst);
4805 cache_prefetch_next ((__m128i*)mask);
4807 m = *(uint32_t*) mask;
4808 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
4810 if (m == 0xffffffff)
4812 save_128_aligned ((__m128i*)dst, xmm_src);
4816 xmm_dst = load_128_aligned ((__m128i*)dst);
4818 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4820 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4821 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4822 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4824 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4826 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
4828 save_128_aligned( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4839 m = (uint32_t) *mask++;
4843 s = 0xff000000 | *src;
4853 *dst = pack_1x64_32 (in_over_1x64 (unpack_32_1x64 (s),
4855 expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
4856 unpack_32_1x64 (d)));
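/* Fast path table: each entry maps an (operator, source format, mask format,
 * destination format) combination to one of the specialized routines above,
 * together with flags such as NEED_SOLID_MASK, NEED_COMPONENT_ALPHA and
 * NEED_PIXBUF. */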
4871 static const pixman_fast_path_t sse2_fast_paths[] =
4873 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 },
4874 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 },
4875 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 },
4876 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 },
4877 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 },
4878 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
4879 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
4880 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
4881 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
4882 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
4883 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
4884 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 },
4885 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 },
4886 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 },
4887 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 },
4889 /* FIXME: This code is buggy in the MMX version, and the bug has been carried over to this SSE2 version */
4890 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
4891 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
4892 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
4893 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
4895 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
4896 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
4897 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
4898 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
4899 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
4900 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
4901 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
4902 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
4903 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
4904 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
4905 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
4906 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
4907 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
4908 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
4909 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
4910 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
4911 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
4912 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
4913 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
4914 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
4915 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
4916 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
4917 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
4918 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
4919 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
4920 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
4921 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
4922 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
4924 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 },
4925 { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 },
4926 { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 },
4927 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_8888_8_8, 0 },
4929 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 },
4930 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 },
4931 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 },
4932 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 },
4933 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 },
4934 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 },
4935 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
4936 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
4937 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
4938 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
4939 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 },
4940 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 },
4942 { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 },
4943 { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 },
4949 * Work around GCC bug causing crashes in Mozilla with SSE2
4951 * When using -msse, gcc generates movdqa instructions assuming that
4952 * the stack is 16 byte aligned. Unfortunately some applications, such
4953 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
4954 * causes the movdqa instructions to fail.
4956 * The __force_align_arg_pointer__ makes gcc generate a prologue that
4957 * realigns the stack pointer to 16 bytes.
4959 * On x86-64 this is not necessary because the standard ABI already
4960 * calls for a 16 byte aligned stack.
4962 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
4964 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
4965 __attribute__((__force_align_arg_pointer__))
4968 sse2_composite (pixman_implementation_t *imp,
4970 pixman_image_t *src,
4971 pixman_image_t *mask,
4972 pixman_image_t *dest,
4982 if (_pixman_run_fast_path (sse2_fast_paths, imp,
4983 op, src, mask, dest,
4992 _pixman_implementation_composite (imp->delegate, op,
5000 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5001 __attribute__((__force_align_arg_pointer__))
5003 static pixman_bool_t
5004 sse2_blt (pixman_implementation_t *imp,
5011 int src_x, int src_y,
5012 int dst_x, int dst_y,
5013 int width, int height)
5015 if (!pixman_blt_sse2 (
5016 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5017 src_x, src_y, dst_x, dst_y, width, height))
5020 return _pixman_implementation_blt (
5022 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5023 src_x, src_y, dst_x, dst_y, width, height);
5029 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5030 __attribute__((__force_align_arg_pointer__))
5032 static pixman_bool_t
5033 sse2_fill (pixman_implementation_t *imp,
5043 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5045 return _pixman_implementation_fill (
5046 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5052 pixman_implementation_t *
5053 _pixman_implementation_create_sse2 (void)
5055 pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5056 pixman_implementation_t *imp = _pixman_implementation_create (mmx);
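/* The SSE2 implementation is created on top of the MMX one, so any operation
 * without an SSE2 fast path or combiner falls through to the delegate
 * (see sse2_composite, sse2_blt and sse2_fill above). */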
5058 /* SSE2 constants */
5059 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5060 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5061 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5062 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5063 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5064 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5065 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5066 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5067 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
5068 mask_0080 = create_mask_16_128 (0x0080);
5069 mask_00ff = create_mask_16_128 (0x00ff);
5070 mask_0101 = create_mask_16_128 (0x0101);
5071 mask_ffff = create_mask_16_128 (0xffff);
5072 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5073 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5076 mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5077 mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5079 mask_x0080 = create_mask_16_64 (0x0080);
5080 mask_x00ff = create_mask_16_64 (0x00ff);
5081 mask_x0101 = create_mask_16_64 (0x0101);
5082 mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5086 /* Set up function pointers */
5088 /* SSE code patch for fbcompose.c */
5089 imp->combine_32[PIXMAN_OP_OVER] = sse2combine_over_u;
5090 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2combine_over_reverse_u;
5091 imp->combine_32[PIXMAN_OP_IN] = sse2combine_in_u;
5092 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2combine_in_reverse_u;
5093 imp->combine_32[PIXMAN_OP_OUT] = sse2combine_out_u;
5094 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2combine_out_reverse_u;
5095 imp->combine_32[PIXMAN_OP_ATOP] = sse2combine_atop_u;
5096 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2combine_atop_reverse_u;
5097 imp->combine_32[PIXMAN_OP_XOR] = sse2combine_xor_u;
5098 imp->combine_32[PIXMAN_OP_ADD] = sse2combine_add_u;
5100 imp->combine_32[PIXMAN_OP_SATURATE] = sse2combine_saturate_u;
5102 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2combine_src_c;
5103 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2combine_over_c;
5104 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2combine_over_reverse_c;
5105 imp->combine_32_ca[PIXMAN_OP_IN] = sse2combine_in_c;
5106 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2combine_in_reverse_c;
5107 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2combine_out_c;
5108 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2combine_out_reverse_c;
5109 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2combine_atop_c;
5110 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2combine_atop_reverse_c;
5111 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2combine_xor_c;
5112 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2combine_add_c;
5114 imp->composite = sse2_composite;
5115 imp->blt = sse2_blt;
5116 imp->fill = sse2_fill;
5121 #endif /* USE_SSE2 */